Note: This is just a tiny subset of a full modeling workflow. Before modeling, we must first understand the domain knowledge behind our training datasets and perform statistical analysis.
import pandas as pd

# Airline passenger-satisfaction dataset hosted on GitHub.
_DATA_URL = "https://raw.githubusercontent.com/Datamanim/datarepo/main/airline"

# Load train features, train labels, and test features.
x_train = pd.read_csv(f"{_DATA_URL}/x_train.csv")
y_train = pd.read_csv(f"{_DATA_URL}/y_train.csv")
x_test = pd.read_csv(f"{_DATA_URL}/x_test.csv")

# First look at the data: sample rows and shapes.
display(x_train.head())
display(y_train.head())
print(x_train.shape)
print(x_test.shape)

# Check dtypes and missing values per column.
print(x_train.info())
print(x_test.info())
# Copy the raw frames and use 'ID' as the index so predictions can be
# joined back to rows later.
train = x_train.copy().set_index('ID')
test = x_test.copy().set_index('ID')
# Binary target: 1 for 'satisfied', 0 otherwise.
y = y_train.copy().set_index('ID').apply(lambda x: 1 if x['satisfaction'] == 'satisfied' else 0, axis = 1)
# Drop the redundant 'id' column (a second row identifier).
train.drop(columns = 'id', inplace = True)
test.drop(columns = 'id', inplace = True)
# Impute missing arrival delays with the TRAIN median for BOTH splits.
# The original filled test with the test set's own median, which leaks
# test-set statistics into preprocessing; imputation values must come
# from training data only.
arrival_median = train['Arrival Delay in Minutes'].median()
# Plain assignment instead of fillna(..., inplace=True) on a column:
# chained in-place fillna is deprecated in pandas 2.x and may not
# write back to the parent frame.
train['Arrival Delay in Minutes'] = train['Arrival Delay in Minutes'].fillna(arrival_median)
test['Arrival Delay in Minutes'] = test['Arrival Delay in Minutes'].fillna(arrival_median)
# Inspect every object (categorical) column: category frequencies,
# distinct values, and cardinality — for both the train and test splits.
obj_cols = train.select_dtypes(include = ['object']).columns
for col in obj_cols:
    print(f"{col} in Train")
    for frame in (train, test):
        print(frame[col].value_counts())
        print(frame[col].unique())
        print(frame[col].nunique())
        print('\n')
# Summary statistics for the numeric columns.
display(train.describe())

# Winsorize the heavy-tailed delay columns: cap each at the TRAIN set's
# 80th percentile, applying the same train-derived cap to train and test
# so no test statistics influence preprocessing.
for delay_col in ('Departure Delay in Minutes', 'Arrival Delay in Minutes'):
    cap = train[delay_col].quantile(0.80)
    train.loc[train[delay_col] > cap, delay_col] = cap
    test.loc[test[delay_col] > cap, delay_col] = cap
# Preprocessing: split numeric columns into low-cardinality "discrete"
# ones (< 8 distinct values — presumably 0-5 survey ratings; confirm
# against the data dictionary) and continuous ones.
num_cols = train.select_dtypes(exclude = ['object']).columns
obj_cols = train.select_dtypes(include = ['object']).columns
dct_cols = [col for col in num_cols if train[col].nunique() < 8]
cnt_cols = [col for col in num_cols if col not in dct_cols]

# .copy() so the scaled assignment below writes to an independent frame
# rather than a view of `train`/`test` (the original triggered
# SettingWithCopyWarning and risked silently not writing back).
X = train[dct_cols + cnt_cols].copy()
X_test = test[dct_cols + cnt_cols].copy()

from sklearn.preprocessing import StandardScaler
# Fit the scaler on train only and reuse it on test (no leakage).
ss = StandardScaler()
X[cnt_cols] = ss.fit_transform(X[cnt_cols])
X_test[cnt_cols] = ss.transform(X_test[cnt_cols])

# One-hot encode the object columns. Reindex the test dummies to the
# train dummy columns so both matrices share the same schema even if a
# category is missing from (or extra in) the test split.
X_obj = pd.get_dummies(train[obj_cols])
X_test_obj = pd.get_dummies(test[obj_cols]).reindex(columns = X_obj.columns, fill_value = 0)

X = pd.concat([X, X_obj], axis = 1)
X_test = pd.concat([X_test, X_test_obj], axis = 1)
print(X.shape)
print(X_test.shape)
# Hold out 20% of the training data for validation. random_state makes
# the split reproducible (the original was nondeterministic) and
# stratify preserves the class balance in both splits.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y)

# Modeling: random forest with a fixed seed for reproducibility.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train, y_train)

# Evaluate with ROC-AUC on the positive-class probabilities.
# (Hard labels from rf.predict are not needed for AUC, so the original's
# unused `predicts` variable is dropped.)
from sklearn.metrics import roc_auc_score
predict_proba = rf.predict_proba(X_valid)
print(roc_auc_score(y_valid, predict_proba[:, 1]))
# 0.9947632939680816
# Build the submission file: positive-class probability for each test
# row, keyed by the test index ('ID'), then read it back to verify.
submission = pd.DataFrame({
    'ID' : X_test.index,
    'satisfaction' : rf.predict_proba(X_test)[:, 1],
})
submission.to_csv('000005.csv', index = False)
df = pd.read_csv('000005.csv')
df.head()
Other posts in the 'Data Science > Classification' category:

| Post | Comments | Date |
|---|---|---|
| [Theorem] Bias vs Variance | (0) | 2022.09.19 |
| [Theorem] Validation Sets | (1) | 2022.09.19 |
| [Theorem] Regularization | (0) | 2022.09.19 |
| [Theorem] Overfitting | (0) | 2022.09.19 |
| [Theorem] Logistic Regression | (1) | 2022.09.19 |