Machine Learning Shootout



#===============<0. Modules>========================================
import sklearn as sk
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, ElasticNet, LogisticRegression
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import make_pipeline
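

#===============<0.1 Data Setup (assumed)>========================================
# The sections below assume X, y (training split) and X_test, y_test are
# already defined. A minimal sketch using the imported train_test_split;
# 'data.csv' and the 'target' column are placeholder names, not from the
# original post.
df = pd.read_csv('data.csv')
X, X_test, y, y_test = train_test_split(df.drop(columns=['target']),
                                        df['target'],
                                        test_size=0.2,
                                        random_state=2024)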





#==============================< Regressor >========================================


#==============================< Regressor Comparison >========================================
model_dt = DecisionTreeRegressor(random_state=2024)
model_rf = RandomForestRegressor(random_state=2024)
model_lr = LinearRegression()   # regression counterpart; LogisticRegression is a classifier
model_xgb = XGBRegressor(random_state=2024)
#model_xgb = XGBRegressor(random_state=2024, tree_method='gpu_hist', gpu_id=0)
model_lgbm = LGBMRegressor(random_state=2024)
model_cat = CatBoostRegressor(random_state=2024, verbose=0)  # verbose=0 silences per-iteration logs

# DecisionTree Regressor
model_dt.fit(X, y)
pred_dt = model_dt.predict(X_test)
mape_dt = mean_absolute_percentage_error(y_test, pred_dt)

# RandomForest Regressor
model_rf.fit(X, y)
pred_rf = model_rf.predict(X_test)
mape_rf = mean_absolute_percentage_error(y_test, pred_rf)

# Linear Regression
model_lr.fit(X, y)
pred_lr = model_lr.predict(X_test)
mape_lr = mean_absolute_percentage_error(y_test, pred_lr)

# XGBoost Regression
model_xgb.fit(X, y)
pred_xgb = model_xgb.predict(X_test)
mape_xgb = mean_absolute_percentage_error(y_test, pred_xgb)

# LGBM Regressor
model_lgbm.fit(X, y)
pred_lgbm = model_lgbm.predict(X_test)
mape_lgbm = mean_absolute_percentage_error(y_test, pred_lgbm)

# CatBoost Regressor
model_cat.fit(X, y)
pred_cat = model_cat.predict(X_test)
mape_cat = mean_absolute_percentage_error(y_test, pred_cat)

print(f'[DecisionTree Regressor] Loss MAPE >> {mape_dt*100:.2f}%\n')
print(f'[RandomForest Regressor] Loss MAPE >> {mape_rf*100:.2f}%\n')
print(f'[Linear Regression] Loss MAPE >> {mape_lr*100:.2f}%\n')
print(f'[XGBoost Regressor] Loss MAPE >> {mape_xgb*100:.2f}%\n')
print(f'[LGBM Regressor] Loss MAPE >> {mape_lgbm*100:.2f}%\n')
print(f'[CatBoost Regressor] Loss MAPE >> {mape_cat*100:.2f}%') 
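
#==============================< Regressor Leaderboard (optional) >========================================
# A minimal sketch that gathers the MAPE values above into one sorted pandas
# Series, so the six print lines read as a single table.
mape_scores = pd.Series({'DecisionTree': mape_dt,
                         'RandomForest': mape_rf,
                         'Linear': mape_lr,
                         'XGBoost': mape_xgb,
                         'LGBM': mape_lgbm,
                         'CatBoost': mape_cat}, name='MAPE (%)')
print((mape_scores * 100).round(2).sort_values())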

#==============================< ML - Decision Tree Regressor - Grid Search >========================================
pipe_tree = make_pipeline(DecisionTreeRegressor(random_state=2024))

param_grid = {'decisiontreeregressor__max_depth': list(range(1, 30, 1)),
              'decisiontreeregressor__min_samples_leaf': list(range(1, 50, 5))}

gs = GridSearchCV(estimator = pipe_tree,
                 param_grid = param_grid,
                 scoring = 'neg_mean_absolute_error', 
                 cv = 10,
                 n_jobs= -1)

gs = gs.fit(X, y)
 
print(-gs.best_score_)
print(gs.best_params_)

model_dt_grid = gs.best_estimator_
model_dt_grid.fit(X, y)
 
pred_dt_G = model_dt_grid.predict(X_test)
mape_dt_G = mean_absolute_percentage_error(y_test, pred_dt_G)

print(f'[DecisionTree Regressor] [Grid Search] MAPE >> {mape_dt_G*100:.2f}%')

#==============================< ML - Random Forest Regressor - Grid Search >========================================
pipe_tree = make_pipeline(RandomForestRegressor(random_state=2024))

param_grid = { 'randomforestregressor__n_estimators': list(range(10, 50, 10)),
              'randomforestregressor__max_depth':  [None, 5, 10, 15, 20],
              'randomforestregressor__min_samples_split': [2, 5, 10],
              'randomforestregressor__min_samples_leaf': [1, 2, 4],
              'randomforestregressor__max_leaf_nodes': [None, 5, 10, 20],
              'randomforestregressor__bootstrap': [True, False]}
 
gs = GridSearchCV(estimator = pipe_tree,
                 param_grid = param_grid,
                 scoring = 'neg_mean_absolute_error',
                 n_jobs= -1)

gs = gs.fit(X, y)
 
print(-gs.best_score_)
print(gs.best_params_)

model_rf_grid = gs.best_estimator_
model_rf_grid.fit(X, y)
 
pred_rf_G = model_rf_grid.predict(X_test)
mape_rf_G = mean_absolute_percentage_error(y_test, pred_rf_G)

print(f'[RandomForest Regressor] [Grid Search] MAPE >> {mape_rf_G*100:.2f}%')
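
#==============================< ML - Random Forest Regressor - Randomized Search (optional) >========================================
# The grid above has 4*5*3*3*4*2 = 1,440 combinations, i.e. 7,200 fits with
# the default cv=5. A sketch of RandomizedSearchCV as a cheaper alternative
# that samples n_iter settings from the same param_grid:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(estimator=pipe_tree,
                        param_distributions=param_grid,
                        n_iter=50,
                        scoring='neg_mean_absolute_error',
                        n_jobs=-1,
                        random_state=2024)
rs = rs.fit(X, y)

print(-rs.best_score_)
print(rs.best_params_)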

#==============================< ML - ElasticNet Regression - Grid Search >========================================
# LogisticRegression is a classifier and will not fit a continuous target,
# so the regression benchmark tunes ElasticNet instead; alpha/l1_ratio play
# the role that C/penalty/l1_ratio play for LogisticRegression.
pipe_tree = make_pipeline(ElasticNet(random_state=2024))

param_grid = {
    'elasticnet__alpha': np.logspace(-4, 4, 20),
    'elasticnet__l1_ratio': np.linspace(0, 1, 10)}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'neg_mean_absolute_error',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(-gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_lr_grid = gs.best_estimator_
model_lr_grid.fit(X, y)

pred_lr_G = model_lr_grid.predict(X_test)
mape_lr_G = mean_absolute_percentage_error(y_test, pred_lr_G)

print(f'[ElasticNet Regression] [Grid Search] MAPE >> {mape_lr_G*100:.2f}%')

#==============================< ML - XGBoost Regressor - Grid Search >========================================
pipe_tree = make_pipeline(XGBRegressor(random_state=2024))
#pipe_tree = make_pipeline(XGBRegressor(random_state=2024, tree_method='gpu_hist', gpu_id=0))

param_grid = {
    'xgbregressor__n_estimators': [50, 100, 200, 300],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__learning_rate': [0.1, 0.01, 0.05],
    'xgbregressor__subsample': [0.8, 0.9, 1.0],
    'xgbregressor__colsample_bytree': [0.8, 0.9, 1.0]}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'neg_mean_absolute_error',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(-gs.best_score_)
print(gs.best_params_)

model_xgb_grid = gs.best_estimator_
model_xgb_grid.fit(X, y)
 
pred_xgb_G = model_xgb_grid.predict(X_test)
mape_xgb_G = mean_absolute_percentage_error(y_test, pred_xgb_G)

print(f'[XGBoost Regressor] [Grid Search] MAPE >> {mape_xgb_G*100:.2f}%')

#==============================< ML - LGBM Regressor - Grid Search >========================================
pipe_tree = make_pipeline(LGBMRegressor(random_state=2024))

param_grid = {
    'lgbmregressor__learning_rate': [0.1, 0.15, 0.2],
    'lgbmregressor__max_depth': [1, 5, 10, 20, 30],
    'lgbmregressor__num_leaves': [10, 20, 30, 50, 100],
    'lgbmregressor__n_estimators': [50, 100, 200],
    'lgbmregressor__min_child_weight': [10, 20, 30],
    'lgbmregressor__colsample_bytree': [0.5, 0.7, 1.0]}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'neg_mean_absolute_error',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(-gs.best_score_)
print(gs.best_params_)

model_lgbm_grid = gs.best_estimator_
model_lgbm_grid.fit(X, y)
 
pred_lgbm_G = model_lgbm_grid.predict(X_test)
mape_lgbm_G = mean_absolute_percentage_error(y_test, pred_lgbm_G)

print(f'[LGBM Regressor] [Grid Search] MAPE >> {mape_lgbm_G*100:.2f}%')

#==============================< ML - Cat Boost Regressor - Grid Search >========================================
pipe_tree = make_pipeline(CatBoostRegressor(random_state=2024, verbose=0))

param_grid = {
    'catboostregressor__depth': list(range(1,10,1)),
    'catboostregressor__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'catboostregressor__iterations': [100, 200, 300, 400, 500]}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'neg_mean_absolute_error',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(-gs.best_score_)
print(gs.best_params_)

model_cat_grid = gs.best_estimator_
model_cat_grid.fit(X, y)
 
pred_cat_G = model_cat_grid.predict(X_test)
mape_cat_G = mean_absolute_percentage_error(y_test, pred_cat_G)

print(f'[CatBoost Regressor] [Grid Search] MAPE >> {mape_cat_G*100:.2f}%')

#==============================< GridSearch Comparison >========================================
print(f'[DecisionTree Regressor] [GridSearch] Loss MAPE >> {mape_dt_G*100:.2f}%\n')
print(f'[RandomForest Regressor] [GridSearch] Loss MAPE >> {mape_rf_G*100:.2f}%\n')
print(f'[ElasticNet Regression] [GridSearch] Loss MAPE >> {mape_lr_G*100:.2f}%\n')
print(f'[XGBoost Regressor] [GridSearch] Loss MAPE >> {mape_xgb_G*100:.2f}%\n')
print(f'[LGBM Regressor] [GridSearch] Loss MAPE >> {mape_lgbm_G*100:.2f}%\n')
print(f'[CatBoost Regressor] [GridSearch] Loss MAPE >> {mape_cat_G*100:.2f}%') 
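
#==============================< Baseline vs GridSearch (optional) >========================================
# A sketch lining up the default-parameter MAPE next to the tuned MAPE for
# each model, using the variables computed above ('Linear/ElasticNet' pairs
# the LinearRegression baseline with the tuned ElasticNet).
compare = pd.DataFrame(
    {'baseline': [mape_dt, mape_rf, mape_lr, mape_xgb, mape_lgbm, mape_cat],
     'gridsearch': [mape_dt_G, mape_rf_G, mape_lr_G, mape_xgb_G, mape_lgbm_G, mape_cat_G]},
    index=['DecisionTree', 'RandomForest', 'Linear/ElasticNet',
           'XGBoost', 'LGBM', 'CatBoost'])
print((compare * 100).round(2))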





#==============================< Classifier >========================================


#==============================< Classifier Comparison >========================================
model_dt = DecisionTreeClassifier(random_state=2024)
model_rf = RandomForestClassifier(random_state=2024)
model_lr = LogisticRegression(random_state=2024)
model_xgb = XGBClassifier(random_state=2024)
#model_xgb = XGBClassifier(random_state=2024, tree_method='gpu_hist', gpu_id=0)
model_lgbm = LGBMClassifier(random_state=2024)
model_cat = CatBoostClassifier(random_state=2024, verbose=0)

# DecisionTree Classifier
model_dt.fit(X, y)
pred_dt = model_dt.predict(X_test)
acc_dt = accuracy_score(y_test, pred_dt)

# RandomForest Classifier
model_rf.fit(X, y)
pred_rf = model_rf.predict(X_test)
acc_rf = accuracy_score(y_test, pred_rf)

# Logistic Regression
model_lr.fit(X, y)
pred_lr = model_lr.predict(X_test)
acc_lr = accuracy_score(y_test, pred_lr)

# XGBoost Classifier
model_xgb.fit(X, y)
pred_xgb = model_xgb.predict(X_test)
acc_xgb = accuracy_score(y_test, pred_xgb)

# LGBM Classifier
model_lgbm.fit(X, y)
pred_lgbm = model_lgbm.predict(X_test)
acc_lgbm = accuracy_score(y_test, pred_lgbm)

# CatBoost Classifier
model_cat.fit(X, y)
pred_cat = model_cat.predict(X_test)
acc_cat = accuracy_score(y_test, pred_cat)

print(f'[DecisionTree Classifier]\n{classification_report(y_test, pred_dt)}\n')
print(f'[RandomForest Classifier]\n{classification_report(y_test, pred_rf)}\n')
print(f'[Logistic Regression]\n{classification_report(y_test, pred_lr)}\n')
print(f'[XGBoost Classifier]\n{classification_report(y_test, pred_xgb)}\n')
print(f'[LGBM Classifier]\n{classification_report(y_test, pred_lgbm)}\n')
print(f'[CatBoost Classifier]\n{classification_report(y_test, pred_cat)}')

print(f'[DecisionTree Classifier] Accuracy >> {acc_dt*100:.2f}%\n')
print(f'[RandomForest Classifier] Accuracy >> {acc_rf*100:.2f}%\n')
print(f'[Logistic Regression] Accuracy >> {acc_lr*100:.2f}%\n')
print(f'[XGBoost Classifier] Accuracy >> {acc_xgb*100:.2f}%\n')
print(f'[LGBM Classifier] Accuracy >> {acc_lgbm*100:.2f}%\n')
print(f'[CatBoost Classifier] Accuracy >> {acc_cat*100:.2f}%')  
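
#==============================< Confusion Matrix (optional) >========================================
# ConfusionMatrixDisplay is imported at the top but never used; a minimal
# sketch that plots the confusion matrix for one of the fitted classifiers
# (RandomForest here; swap in whichever model scored best above).
import matplotlib.pyplot as plt

ConfusionMatrixDisplay.from_predictions(y_test, pred_rf)
plt.show()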

#==============================< ML - Decision Tree Classifier - Grid Search >========================================
pipe_tree = make_pipeline(DecisionTreeClassifier(random_state=2024))

param_grid = {'decisiontreeclassifier__criterion': ['gini', 'entropy'],
              'decisiontreeclassifier__splitter': ['best', 'random'],
              'decisiontreeclassifier__max_depth': [None, 10, 20, 30],
              'decisiontreeclassifier__min_samples_split': [2, 5, 10],
              'decisiontreeclassifier__min_samples_leaf': [1, 2, 4]}
 
gs = GridSearchCV(estimator = pipe_tree,
                 param_grid = param_grid,
                 scoring = 'accuracy', 
                 cv = 10,
                 n_jobs= -1) 

gs = gs.fit(X, y)
 
print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_dt_grid = gs.best_estimator_
model_dt_grid.fit(X, y)
 
pred_dt_G = model_dt_grid.predict(X_test)
acc_dt_G = accuracy_score(y_test, pred_dt_G)

print(f'[DecisionTree Classifier] [Grid Search] Accuracy >> {acc_dt_G*100:.2f}%')

#==============================< ML - RandomForest Classifier - Grid Search >========================================
pipe_tree = make_pipeline(RandomForestClassifier(random_state=2024))

param_grid = {'randomforestclassifier__n_estimators': [50, 100, 200],
              'randomforestclassifier__criterion': ['gini', 'entropy'],
              'randomforestclassifier__max_depth': [None, 10, 20, 30],
              'randomforestclassifier__min_samples_split': [2, 5, 10],
              'randomforestclassifier__min_samples_leaf': [1, 2, 3, 4, 5],
'randomforestclassifier__max_features': ['sqrt', 'log2', None]}
 
gs = GridSearchCV(estimator = pipe_tree,
                 param_grid = param_grid,
                 scoring = 'accuracy',
                 cv = 10,
                 n_jobs= -1)

gs = gs.fit(X, y)
 
print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_rf_grid = gs.best_estimator_
model_rf_grid.fit(X, y)
 
pred_rf_G = model_rf_grid.predict(X_test)
acc_rf_G = accuracy_score(y_test, pred_rf_G)

print(f'[RandomForest Classifier] [Grid Search] Accuracy >> {acc_rf_G*100:.2f}%')

#==============================< ML - Logistic Regressor - Grid Search >========================================
pipe_tree = make_pipeline(LogisticRegression(random_state=2024, max_iter=1000))  # sag/saga rarely converge in the default 100 iterations

# Only compatible penalty/solver pairs are searched; a flat grid mixing them
# makes scikit-learn raise for invalid combinations.
param_grid = [
    {'logisticregression__solver': ['liblinear'],
     'logisticregression__penalty': ['l1', 'l2'],
     'logisticregression__C': np.logspace(-4, 4, 20)},
    {'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag'],
     'logisticregression__penalty': ['l2'],
     'logisticregression__C': np.logspace(-4, 4, 20)},
    {'logisticregression__solver': ['saga'],
     'logisticregression__penalty': ['elasticnet'],
     'logisticregression__l1_ratio': np.linspace(0, 1, 10),
     'logisticregression__C': np.logspace(-4, 4, 20)},
    {'logisticregression__solver': ['lbfgs'],
     'logisticregression__penalty': [None]}]

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'accuracy',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_lr_grid = gs.best_estimator_
model_lr_grid.fit(X, y)
 
pred_lr_G = model_lr_grid.predict(X_test)
acc_lr_G = accuracy_score(y_test, pred_lr_G)

print(f'[Logistic Regression] [Grid Search] Accuracy >> {acc_lr_G*100:.2f}%')

#==============================< ML - XGB Classifier - Grid Search >========================================
pipe_tree = make_pipeline(XGBClassifier(random_state=2024))
#pipe_tree = make_pipeline(XGBClassifier(random_state=2024, tree_method='gpu_hist', gpu_id=0))

param_grid = {
    'xgbclassifier__n_estimators': [50, 100, 200, 300],
    'xgbclassifier__max_depth': [3, 5, 7], 
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.05],
    'xgbclassifier__subsample': [0.8, 0.9, 1.0], 
    'xgbclassifier__colsample_bytree': [0.8, 0.9, 1.0]}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'accuracy',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_xgb_grid = gs.best_estimator_
model_xgb_grid.fit(X, y)
 
pred_xgb_G = model_xgb_grid.predict(X_test)
acc_xgb_G = accuracy_score(y_test, pred_xgb_G)

print(f'[XGBoost Classifier] [Grid Search] Accuracy >> {acc_xgb_G*100:.2f}%')

#==============================< ML - LGBM Classifier - Grid Search >========================================
pipe_tree = make_pipeline(LGBMClassifier(random_state=2024))

param_grid = {
    'lgbmclassifier__learning_rate': [0.01, 0.05, 0.1],
    'lgbmclassifier__n_estimators': [50, 100, 200],
    'lgbmclassifier__num_leaves': [20, 30, 40],
    'lgbmclassifier__max_depth': [-1, 10, 20, 30],
    'lgbmclassifier__min_child_samples': [1, 5, 10],
    'lgbmclassifier__subsample': [0.8, 1.0],
    'lgbmclassifier__colsample_bytree': [0.8, 1.0]}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'accuracy',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_lgbm_grid = gs.best_estimator_
model_lgbm_grid.fit(X, y)
 
pred_lgbm_G = model_lgbm_grid.predict(X_test)
acc_lgbm_G = accuracy_score(y_test, pred_lgbm_G)

print(f'[LGBM Classifier] [Grid Search] Accuracy >> {acc_lgbm_G*100:.2f}%')

#==============================< ML - CatBoost Classifier - Grid Search >========================================
pipe_tree = make_pipeline(CatBoostClassifier(random_state=2024, verbose=0))

param_grid = {
    'catboostclassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'catboostclassifier__iterations': [50, 100, 200],
    'catboostclassifier__depth': [4, 6, 8, 10],
    'catboostclassifier__l2_leaf_reg': [1, 3, 5, 7, 9],
    'catboostclassifier__border_count': [32, 64, 128]}

gs = GridSearchCV(pipe_tree, 
                  param_grid,
                  scoring = 'accuracy',
                  cv=10,
                  n_jobs= -1)

gs.fit(X, y)

print(gs.best_score_)
print(gs.best_params_)

model_cat_grid = gs.best_estimator_
model_cat_grid.fit(X, y)
 
pred_cat_G = model_cat_grid.predict(X_test)
acc_cat_G = accuracy_score(y_test, pred_cat_G)

print(f'[CatBoost Classifier] [Grid Search] Accuracy >> {acc_cat_G*100:.2f}%')

#==============================< GridSearch Comparison >========================================
print(f'[DecisionTree Classifier] [GridSearch] Accuracy >> {acc_dt_G*100:.2f}%\n')
print(f'[RandomForest Classifier] [GridSearch] Accuracy >> {acc_rf_G*100:.2f}%\n')
print(f'[Logistic Regression] [GridSearch] Accuracy >> {acc_lr_G*100:.2f}%\n')
print(f'[XGBoost Classifier] [GridSearch] Accuracy >> {acc_xgb_G*100:.2f}%\n')
print(f'[LGBM Classifier] [GridSearch] Accuracy >> {acc_lgbm_G*100:.2f}%\n')
print(f'[CatBoost Classifier] [GridSearch] Accuracy >> {acc_cat_G*100:.2f}%')  
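
#==============================< Classifier Leaderboard (optional) >========================================
# Same idea as the regressor table: the tuned accuracies gathered into one
# sorted pandas Series.
acc_scores = pd.Series({'DecisionTree': acc_dt_G,
                        'RandomForest': acc_rf_G,
                        'LogisticRegression': acc_lr_G,
                        'XGBoost': acc_xgb_G,
                        'LGBM': acc_lgbm_G,
                        'CatBoost': acc_cat_G}, name='Accuracy (%)')
print((acc_scores * 100).round(2).sort_values(ascending=False))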


    
