
[Models] Regression Models

Note: This is only a tiny subset of a full modeling workflow. Before modeling, we should first build domain knowledge of the training data and do exploratory statistical analysis (a quick example follows the dataset checks below).

 

import pandas as pd

x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv")
x_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv")

# Check dataset 
print(x_train.head(3))
print(x_test.head(3)) 
print(y_train.head(3)) 
print(x_train.shape)
print(x_test.shape) 
print(y_train.shape)
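
As a first statistical pass in the spirit of the note above, it is worth profiling the target before touching the features. A minimal sketch, assuming the target column in y_train is 'G3' (as the submission step at the end indicates):

print(y_train['G3'].describe())  # center and spread of the target
print(y_train['G3'].skew())      # pronounced skew might motivate a transform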

# Missing values 
print(x_train.info())
print(x_test.info())
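
info() shows the dtype and non-null count of every column. If any feature did contain missing values, a common baseline (not part of the original post) is to impute with statistics computed on train only; a sketch:

# Impute numeric columns with the train median, object columns with the train mode
for col in x_train.select_dtypes(exclude = ['object']).columns:
    med = x_train[col].median()              # statistic from train only
    x_train[col] = x_train[col].fillna(med)
    x_test[col] = x_test[col].fillna(med)
for col in x_train.select_dtypes(include = ['object']).columns:
    mode = x_train[col].mode()[0]            # most frequent category in train
    x_train[col] = x_train[col].fillna(mode)
    x_test[col] = x_test[col].fillna(mode)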

# Copy datasets and set 'StudentID' as index
train = x_train.copy().set_index('StudentID') 
test = x_test.copy().set_index('StudentID') 
y = y_train.copy().set_index('StudentID') 

# Check object variables 
obj_cols = train.select_dtypes(include = ['object']).columns
for col in obj_cols : 
    print(train[col].value_counts().head())
    print(train[col].unique())
    print(train[col].nunique())
    print(test[col].value_counts().head())
    print(test[col].unique())
    print(test[col].nunique())
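
The loop above prints train and test separately; one way to check directly for categories that appear only in test (which would break a naive one-hot encoding) is:

for col in obj_cols :
    unseen = set(test[col].unique()) - set(train[col].unique())
    if unseen :
        print(col, 'has categories only in test:', unseen)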

# Drop column 'G1' (an earlier-period grade; presumably removed so the model
# does not simply echo a near-duplicate of the final grade 'G3')
train = train.drop(columns = 'G1')
test = test.drop(columns = 'G1')

# Check numerical variables 
print(train.describe())
print(train.corr(numeric_only = True))  # numeric_only avoids errors on object columns in recent pandas
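
describe() and corr() only profile the features against each other; for feature screening, correlation with the target is usually more telling. A sketch, again assuming the target column in y is 'G3':

print(train.join(y).corr(numeric_only = True)['G3'].sort_values())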

# Process outliers 
# Split numeric columns into discrete-like (few unique values) and continuous ones
num_cols = train.select_dtypes(exclude = ['object']).columns 
dct_cols = [col for col in num_cols if train[col].nunique() < 10]
cnt_cols = [col for col in num_cols if col not in dct_cols]

# Cap the continuous columns at the 1.5 * IQR fences, computed on train only
for col in cnt_cols : 
    Q1 = train[col].quantile(0.25)
    Q3 = train[col].quantile(0.75)
    IQR = Q3 - Q1 
    upper = Q3 + 1.5 * IQR
    lower = Q1 - 1.5 * IQR 
    train.loc[train[col] > upper, col] = upper 
    train.loc[train[col] < lower, col] = lower 
    test.loc[test[col] > upper, col] = upper 
    test.loc[test[col] < lower, col] = lower 
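
A quick sanity check that the capping took effect: the min and max of each continuous column should now sit within its IQR fences.

print(train[cnt_cols].describe())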
    
# Preprocessing 
from sklearn.preprocessing import MinMaxScaler 
mm = MinMaxScaler() 

# .copy() avoids pandas' SettingWithCopyWarning when the scaled values are assigned back
X = train[num_cols].copy()
X_test = test[num_cols].copy()

# Fit the scaler on train only, then apply the same transform to test
X[num_cols] = mm.fit_transform(X[num_cols])
X_test[num_cols] = mm.transform(X_test[num_cols]) 
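
Because the scaler statistics come from train only, scaled test values can fall slightly outside [0, 1]; a quick check:

print(X_test[num_cols].min().min(), X_test[num_cols].max().max())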

X_obj = pd.get_dummies(train[obj_cols]) 
X_test_obj = pd.get_dummies(test[obj_cols]) 
# Align the test dummies to the train dummy columns: categories missing from test
# are filled with 0 and categories unseen in train are dropped
X_test_obj = X_test_obj.reindex(columns = X_obj.columns, fill_value = 0)

X = pd.concat([X, X_obj], axis = 1) 
X_test = pd.concat([X_test, X_test_obj], axis = 1) 
print(X.shape, X_test.shape)
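
pd.get_dummies encodes train and test independently, which is why the reindex above is needed. An alternative sketch using scikit-learn's OneHotEncoder, which handles categories unseen in train natively (sparse_output assumes scikit-learn >= 1.2; older versions spell it sparse = False):

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
ohe_train = ohe.fit_transform(train[obj_cols])  # categories learned from train
ohe_test = ohe.transform(test[obj_cols])        # unseen categories encode to all zeros
print(ohe_train.shape, ohe_test.shape)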

# Train / validation split (random_state fixed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42) 

# Modeling 
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# xgb = XGBRegressor(random_state = 42) 
# params = {'max_depth' : [6, 8], 'n_estimators' : [200, 300], 'min_child_weight' : [4, 5]}
# grid = GridSearchCV(xgb, param_grid = params, cv = 4)
# grid.fit(X_train, y_train)
# print(grid.best_estimator_)
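
The exhaustive grid above is commented out, presumably because it is slow to rerun. A cheaper alternative (not in the original post) is RandomizedSearchCV, which samples a fixed number of combinations from the same space:

from sklearn.model_selection import RandomizedSearchCV
params = {'max_depth' : [6, 8], 'n_estimators' : [200, 300], 'min_child_weight' : [4, 5]}
rnd = RandomizedSearchCV(XGBRegressor(random_state = 42), param_distributions = params,
                         n_iter = 4, cv = 4, scoring = 'r2', random_state = 42)
# rnd.fit(X_train, y_train); print(rnd.best_params_)  # uncomment to run the search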

# Model Evaluation
from sklearn.metrics import r2_score
import numpy as np
# Refit with the hyperparameters reported by grid.best_estimator_ above
# (the long printout also listed that xgboost version's defaults, omitted here)
best_xgb = XGBRegressor(max_depth = 6, min_child_weight = 4, n_estimators = 200,
                        random_state = 42)
best_xgb.fit(X_train, y_train)
preds = best_xgb.predict(X_valid) 
print(r2_score(y_valid, preds))
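
R² is scale-free; RMSE and MAE, expressed in grade units, make the error easier to interpret:

from sklearn.metrics import mean_squared_error, mean_absolute_error
print('RMSE:', np.sqrt(mean_squared_error(y_valid, preds)))
print('MAE :', mean_absolute_error(y_valid, preds))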

# Make submission 
pred = best_xgb.predict(X_test)
pd.DataFrame({'StudentID' : X_test.index, 'G3' : pred}).to_csv('0001.csv', index = False)
df = pd.read_csv('0001.csv')
print(df.head())

 
