Note: this is only a small slice of a full modeling workflow. Before modeling, we should first build domain knowledge of the training data and do exploratory statistical analysis.
import pandas as pd
x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv")
x_test = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv")
# Check dataset
print(x_train.head(3))
print(x_test.head(3))
print(y_train.head(3))
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
# Missing values
print(x_train.info())
print(x_test.info())
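info() already shows non-null counts; as a direct cross-check, a quick isnull().sum() gives the missing-value count per column:
print(x_train.isnull().sum())
print(x_test.isnull().sum())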
# Copy the datasets and set 'StudentID' as the index
train = x_train.copy().set_index('StudentID')
test = x_test.copy().set_index('StudentID')
y = y_train.copy().set_index('StudentID')
# Check object variables
obj_cols = train.select_dtypes(include=['object']).columns
for col in obj_cols:
    print(train[col].value_counts().head())
    print(train[col].unique())
    print(train[col].nunique())
    print(test[col].value_counts().head())
    print(test[col].unique())
    print(test[col].nunique())
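Since these columns will be one-hot encoded later, it also helps to confirm that the test set introduces no categories unseen in train (a minimal sketch):
for col in obj_cols:
    unseen = set(test[col].unique()) - set(train[col].unique())
    if unseen:
        print(col, 'has categories unseen in train:', unseen)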
# Drop column 'G1'
train = train.drop(columns = 'G1')
test = test.drop(columns = 'G1')
# Check numerical variables
print(train.describe())
print(train.corr(numeric_only=True))  # numeric_only avoids errors on object columns in recent pandas
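We can also rank the numeric features by their correlation with the target; this sketch assumes y's column is named 'G3' (as the submission step below suggests) and relies on train and y sharing the StudentID index set above:
corr_with_target = train.select_dtypes(exclude=['object']).join(y).corr(numeric_only=True)['G3']
print(corr_with_target.sort_values(ascending=False))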
# Process outliers
num_cols = train.select_dtypes(exclude=['object']).columns
disc_cols = [col for col in num_cols if train[col].nunique() < 10]  # discrete, rating-like columns
cont_cols = [col for col in num_cols if col not in disc_cols]       # continuous columns
# Cap outliers in the continuous columns using IQR fences computed on train only
for col in cont_cols:
    Q1 = train[col].quantile(0.25)
    Q3 = train[col].quantile(0.75)
    IQR = Q3 - Q1
    upper = Q3 + 1.5 * IQR
    lower = Q1 - 1.5 * IQR
    train.loc[train[col] > upper, col] = upper
    train.loc[train[col] < lower, col] = lower
    test.loc[test[col] > upper, col] = upper
    test.loc[test[col] < lower, col] = lower
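After capping, the min and max of each continuous column should fall within the IQR fences; describe() gives a quick confirmation:
print(train[cont_cols].describe().loc[['min', 'max']])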
# Preprocessing
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
X = train[num_cols].copy()
X_test = test[num_cols].copy()
# Fit the scaler on train only, then apply the same transform to test
X[num_cols] = mm.fit_transform(X[num_cols])
X_test[num_cols] = mm.transform(X_test[num_cols])
X_obj = pd.get_dummies(train[obj_cols])
X_test_obj = pd.get_dummies(test[obj_cols])
# Align the test dummies to the train dummy columns: categories missing in test
# are filled with zeros, and test-only categories are dropped
X_test_obj = X_test_obj.reindex(columns=X_obj.columns, fill_value=0)
X = pd.concat([X, X_obj], axis=1)
X_test = pd.concat([X_test, X_test_obj], axis=1)
print(X.shape, X_test.shape)
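A quick sanity check that the train and test feature matrices now share identical columns in the same order:
assert list(X.columns) == list(X_test.columns)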
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# Modeling
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# xgb = XGBRegressor(random_state = 42)
# params = {'max_depth' : [6, 8], 'n_estimators' : [200, 300], 'min_child_weight' : [4, 5]}
# grid = GridSearchCV(xgb, param_grid = params, cv = 4)
# grid.fit(X_train, y_train)
# print(grid.best_estimator_)
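The search above was presumably run once and then commented out, with the resulting estimator hard-coded below; if re-run, the chosen settings and cross-validation score can be inspected directly (kept commented to match):
# print(grid.best_params_)
# print(grid.best_score_)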
# Model Evaluation
from sklearn.metrics import r2_score
import numpy as np
# The tuned settings from the grid search; everything else stays at the XGBoost defaults
best_xgb = XGBRegressor(n_estimators=200, max_depth=6, min_child_weight=4,
                        random_state=42)
best_xgb.fit(X_train, y_train)
preds = best_xgb.predict(X_valid)
print(r2_score(y_valid, preds))
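R² alone can be hard to interpret, so RMSE in the target's own grade units is a useful companion (a minimal sketch using sklearn's mean_squared_error):
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_valid, preds)))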
# Make submission
pred = best_xgb.predict(X_test)
pd.DataFrame({'StudentID' : X_test.index, 'G3' : pred}).to_csv('0001.csv', index = False)
df = pd.read_csv('0001.csv')
print(df.head())