본문 바로가기

Data Science/Classification

[Models] Classification Models

Note: This is just a tiny subset of the full modeling workflow. We must first understand the domain knowledge behind our training dataset and perform statistical analysis.

 

import pandas as pd

# Download the three CSV tables (train features, train labels, test features)
# from the Datamanim GitHub repository. Requires network access.
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/x_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/airline/x_test.csv",
    )
)

# Preview the first rows of the training features and labels.
# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter rich-output helper -- confirm the script runs in a notebook.
for preview in (x_train, y_train):
    display(preview.head())

# (rows, columns) of the train and test feature tables.
for features in (x_train, x_test):
    print(features.shape)

# Check missing values
#
# DataFrame.info() writes its report (dtype and non-null count per column)
# straight to stdout and returns None, so it is called directly; wrapping it
# in print() would only append a stray "None" line after each report.
x_train.info()
x_test.info()

# Copy dataset and set index as ID, then drop column 'id'
#
# set_index() and drop() each return a new DataFrame, so the raw x_train /
# x_test tables are left untouched without an explicit .copy() or inplace=True.
train = x_train.set_index('ID').drop(columns='id')
test = x_test.set_index('ID').drop(columns='id')

# Binary target indexed by ID: 1 where satisfaction == 'satisfied', else 0.
# A vectorised comparison replaces the row-by-row apply(axis=1); the values
# are the same (the resulting Series is named 'satisfaction').
y = (y_train.set_index('ID')['satisfaction'] == 'satisfied').astype(int)

# Fill missing values as median
#
# The median is taken from the training data only and reused for the test
# set, so the test data is imputed with a training statistic rather than one
# computed from the test set itself.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df[col]`: that form is chained assignment, which
# warns in recent pandas and does not modify the frame under Copy-on-Write.
arr_delay_median = train['Arrival Delay in Minutes'].median()

train['Arrival Delay in Minutes'] = train['Arrival Delay in Minutes'].fillna(arr_delay_median)
test['Arrival Delay in Minutes'] = test['Arrival Delay in Minutes'].fillna(arr_delay_median)

# Check object variables
#
# For every object-dtype column of `train`, print the category frequencies,
# the distinct values and the number of distinct values -- first for the
# training frame, then for the test frame -- so the two can be compared.

obj_cols = train.select_dtypes(include = ['object']).columns

for col in obj_cols:
    print(f"{col} in Train")
    # Same three summaries for both frames; only the train part has a header.
    for frame in (train, test):
        values = frame[col]
        print(values.value_counts())
        print(values.unique())
        print(values.nunique())
        print('\n')
    
    
# Check numerical variables
display(train.describe())

# Deal outlier
#
# Cap both delay columns at an upper quantile of the TRAINING data and apply
# the same caps to the test data.
# NOTE: despite the `_q75` suffix, these thresholds are the 0.80 quantile
# (80th percentile); the names are kept so any later reference still works.
dep_q75 = train['Departure Delay in Minutes'].quantile(0.80)
arr_q75 = train['Arrival Delay in Minutes'].quantile(0.80)

# clip(upper=...) replaces every value above the cap with the cap and leaves
# NaN untouched. Assigning the clipped column back avoids writing a float cap
# into an integer column through .loc, which recent pandas flags as an
# incompatible-dtype assignment.
for delay_col, cap in (
    ('Departure Delay in Minutes', dep_q75),
    ('Arrival Delay in Minutes', arr_q75),
):
    train[delay_col] = train[delay_col].clip(upper=cap)
    test[delay_col] = test[delay_col].clip(upper=cap)

# Preprocessing
#
# Builds the model matrices X (train) and X_test:
#   * numeric columns with fewer than 8 distinct values are kept as-is,
#   * the remaining numeric columns are standardised with statistics fitted
#     on the training data only,
#   * object columns are one-hot encoded.

from sklearn.preprocessing import StandardScaler

num_cols = train.select_dtypes(exclude = ['object']).columns
obj_cols = train.select_dtypes(include = ['object']).columns
dct_cols = [col for col in num_cols if train[col].nunique() < 8]
cnt_cols = [col for col in num_cols if col not in dct_cols]

# .copy() makes X / X_test independent frames; without it the scaled values
# below are written into a slice of train / test (SettingWithCopyWarning).
X = train[dct_cols + cnt_cols].copy()
X_test = test[dct_cols + cnt_cols].copy()

ss = StandardScaler()

# Fit on train, then reuse the same mean/scale for test.
X[cnt_cols] = ss.fit_transform(X[cnt_cols])
X_test[cnt_cols] = ss.transform(X_test[cnt_cols])

# One-hot encode the categoricals. The test dummies are reindexed to the
# training dummy columns so both matrices have identical columns in the same
# order even if a category is missing from (or only present in) the test set:
# absent categories become all-zero columns, unseen ones are dropped.
X_obj = pd.get_dummies(train[obj_cols])
X_test_obj = pd.get_dummies(test[obj_cols]).reindex(columns = X_obj.columns, fill_value = 0)

X = pd.concat([X, X_obj], axis = 1)
X_test = pd.concat([X_test, X_test_obj], axis = 1)

print(X.shape)
print(X_test.shape)

# Train Test Split
#
# Hold out 20% of the training rows for validation.
#   * random_state fixes the shuffle so the validation score is reproducible
#     from run to run (same seed as the model below).
#   * stratify=y keeps the satisfied / not-satisfied ratio the same in both
#     parts.
# NOTE: `y_train` is rebound here -- from this point on it is the label
# Series for the training split, no longer the raw label table loaded from CSV.

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

# Modeling
#
# Fit a random forest with default hyper-parameters and a fixed seed on the
# training split, then score it on the validation split with ROC AUC.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# fit() returns the fitted estimator, so construction and fitting are chained.
rf = RandomForestClassifier(random_state = 42).fit(X_train, y_train)

# Model Evaluation

predicts = rf.predict(X_valid)              # hard class labels (not used below)
predict_proba = rf.predict_proba(X_valid)   # one probability column per class

# Column 1 is scored against the 0/1 validation labels.
positive_scores = predict_proba[:, 1]
print(roc_auc_score(y_valid, positive_scores))
# Output recorded by the author: 0.9947632939680816

# Make prediction
#
# Score the test matrix, write one row per test ID with the probability from
# column 1 of predict_proba to '000005.csv', then read the file back to check
# what was written.

pred_test = rf.predict_proba(X_test)

submission = pd.DataFrame({'ID' : X_test.index, 'satisfaction' : pred_test[:, 1]})
submission.to_csv('000005.csv', index = False)

df = pd.read_csv('000005.csv')
df.head()

 

 

'Data Science > Classification' 카테고리의 다른 글

[Theorem] Bias vs Variance  (0) 2022.09.19
[Theorem] Validation Sets  (1) 2022.09.19
[Theorem] Regularization  (0) 2022.09.19
[Theorem] Overfitting  (0) 2022.09.19
[Theorem] Logistic Regression  (1) 2022.09.19