# [Kaggle] Titanic Survivor Classification
#
# Titanic Survivor Classification Challenge from Kaggle.
"""
<< Index >>
- 0. Modules
- 1. Train Data Load
- 2. Null Data
- 3. Outliers & One-Hot Encoding
  - 3.1. Outliers
  - 3.2. One-Hot-Encoding 
  - 3.3. Merge DF
- 4. Correlation Analysis
  - 4.1. Correlation Check (include dummies)
  - 4.2. Get Original Categorical Column Names
  - 4.3. Handle Categorical Columns Using Corr (del & dummy)
- 5. Data Split-1 [Data and Label]
- 6. Scaling
- 7. Data Split-2 [Train and Validation]
- 8. Test Data Load
- 9. Machine Learning
  - 9.0. Comparison
  - 9.1. ML - Decision Tree Classifier - Grid Search
  - 9.2. ML - Random Forest Classifier - Grid Search
  - 9.3. ML - Logistic Regressor - Grid Search
  - 9.4. ML - XGBoost Classifier - Grid Search
  - 9.5. ML - LGBM Classifier - Grid Search
  - 9.6. ML - CatBoost Classifier - Grid Search
- 10. Deep Learning
  - 10.1. Network Model
  - 10.2. Test Score
- 11. Final Score Comparison
- 12. Submit
"""

#========================================<0. Modules>========================================
import sklearn as sk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score, r2_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, classification_report

from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool2D, Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier, XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import make_pipeline

import time
import os
import shutil
import math
import copy
from tqdm.notebook import tqdm
import pickle

#========================================<1. Train Data Load>========================================
# Load the Kaggle Titanic training set and report its shape.
train_path = "./Data/Final/titanic/train.csv"
df_train= pd.read_csv(train_path)
print(f"trainset shape >> {df_train.shape}")

df_train.head()

# 'PassengerId' is a unique row key with no predictive signal — drop it.
df_train = df_train.drop(columns='PassengerId')

df_train.info()

#========================================<2. Null Data>========================================
# Partition the column list into categorical vs. numeric groups by dtype.
columns_all = list(df_train.columns)
print(f"Total Columns  ({len(columns_all)} columns)\n  >> {columns_all}\n")

columns_cat = [c for c in columns_all
               if df_train[c].dtype == "object" or df_train[c].dtype == "bool"]
columns_num = [c for c in columns_all if c not in columns_cat]

print(f"Categorical Columns  ({len(columns_cat)} columns)\n  >> {columns_cat}\n")
print(f"Numeric Columns  ({len(columns_num)} columns)\n  >> {columns_num}\n")

# Show the value distribution of every categorical column.
for column in columns_cat:
    print(f"<< {column} >>")
    print(df_train[column].value_counts())
    print('\n'*2)

# Per-column and grand-total null counts.
print(df_train.isnull().sum())
print()
print(df_train.isnull().sum().sum())

# Report the null ratio of every column that has any nulls; columns with
# more than 10% nulls are scheduled for deletion.
too_many_null_columns = []

for column in columns_all:
    null_ratio = df_train[column].isnull().sum() / len(df_train[column])
    if null_ratio == 0:
        continue
    if null_ratio > 0.1:
        too_many_null_columns.append(column)
    print(f"<< {column} - Null ratio >>")
    print(f"{null_ratio*100:.2f}%")

print(f"\nColumns to delete because of Null ({len(too_many_null_columns)} columns) \n  >> {too_many_null_columns}")

# Replace null data (Numeric)
# FIX: the original aliased `df_null = df_train` and used chained
# `df_null[column].fillna(..., inplace=True)` — that mutated df_train
# through the alias, triggers SettingWithCopy warnings, and silently
# stops working under pandas copy-on-write. Assign the filled column
# back explicitly instead.
for column in columns_num:
    # numeric -> median
    df_train[column] = df_train[column].fillna(df_train[column].median())

# Categorical nulls are intentionally not imputed here: high-null
# categorical columns are dropped just below and remaining null rows are
# removed with dropna(). (The original had an empty placeholder loop.)

# Drop columns contain too many null (Categorical)
df_null = df_train.drop(columns=too_many_null_columns)

df_null = df_null.dropna()
df_null.reset_index(drop=True, inplace=True)

# Check null amount again
print(df_null.isnull().sum())
print()
print(df_null.isnull().sum().sum())

df_null.info()

#========================================<3. Outliers & One-Hot Encoding>========================================

#========================================<3.1. Outliers>========================================
columns_all2 = list(df_null.columns)
print(f"Total Columns  ({len(columns_all2)} columns)\n  >> {columns_all2}\n")

columns_num2 = []
columns_cat2 = []

# FIX: the dtype checks previously read df_train; df_null is the frame
# actually being processed here (same result for this data, but correct
# in general).
for column in columns_all2:
    if df_null[column].dtype == "object" or df_null[column].dtype == "bool":
        columns_cat2.append(column)
    else:
        columns_num2.append(column)

print(f"Categorical Columns  ({len(columns_cat2)} columns)\n  >> {columns_cat2}\n")
print(f"Numeric Columns  ({len(columns_num2)} columns)\n  >> {columns_num2}\n")

# Divide dataframe by data type
df_cat = df_null[columns_cat2]
df_num = df_null[columns_num2].copy()  # copy: df_num is mutated below

# Print numeric data graph
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

# Handle outliers with Tukey fences (outer fence = 3 * IQR); the target
# column itself is never touched.
print("<< Outliers >>\n")
target_column = 'Survived'  # FIX: was 'SalePrice', a leftover from a house-price dataset

for column in columns_num2:
    q25, q75 = np.quantile(df_num[column], 0.25), np.quantile(df_num[column], 0.75)
    iqr = q75 - q25

    inner_fence = iqr * 1.5
    outer_fence = iqr * 3

    lower_limit, upper_limit = q25 - outer_fence, q75 + outer_fence

    # FIX: these two lines were corrupted in the source (comparison
    # operators lost); reconstructed as counts beyond each outer fence.
    lower_outlier_cnt = len(df_num[df_num[column] < lower_limit])
    upper_outlier_cnt = len(df_num[df_num[column] > upper_limit])

    total_outlier_cnt = lower_outlier_cnt + upper_outlier_cnt

    ratio = total_outlier_cnt / len(df_num[column]) * 100

    median = np.median(df_num[column])

    if column != target_column and total_outlier_cnt != 0:
        print(f"{column} | Low:{lower_outlier_cnt}ea | Upper:{upper_outlier_cnt}ea | Total:{total_outlier_cnt}ea | Ratio:{ratio:.2f}%")
        # FIX: the original `df_num[df_num[column]>upper_limit] = median`
        # overwrote ENTIRE ROWS (every column) with the median; replace
        # only this column's outliers, on both tails.
        df_num.loc[df_num[column] > upper_limit, column] = median
        df_num.loc[df_num[column] < lower_limit, column] = median

#========================================<3.2. One-Hot-Encoding>========================================
# One-hot-encode every categorical column of the categorical sub-frame.
df_cat_encoded = pd.get_dummies(df_cat, columns=columns_cat2)

# Check Info
df_cat_encoded.info()

#========================================<3.3. Merge DF>========================================
# Re-join the encoded categoricals with the numeric columns.
# FIX: the original performed this exact concat twice; once is enough.
df_merge = pd.concat([df_cat_encoded, df_num], axis=1)

# Print columns
columns_all2 = list(df_merge.columns)
print(columns_all2, '\n')
print(f"DF_merge >> {len(columns_all2)} columns")

#========================================<4. Correlation Analysis>========================================

#========================================<4.1. Correlation Check (include dummies)>========================================

corr_data = df_merge.corr()

corr_columns_to_delete = []

# Collect every (dummy) column whose absolute correlation with the
# target is below 0.1.
print("<< Column under abs 0.1 in correlation >>\n")

# FIX: `corr_data["Survived"][idx]` used integer keys on a string-labelled
# Series, which relies on a deprecated positional fallback. Iterate by
# label instead; [1:] keeps the original loop's bounds (first column
# skipped). 'Survived' itself correlates 1.0 so it is never collected.
for column in columns_all2[1:]:
    corr = corr_data.loc[column, "Survived"]
    if abs(corr) < 0.1:
        print(f"[{column}] >> {corr}")
        corr_columns_to_delete.append(column)

print(f'Total Dummy Columns To Delete  \n  >> {corr_columns_to_delete}\n')
print(f"Count >> {len(corr_columns_to_delete)} columns")

#========================================<4.2. Get Original Categorical Column Names>========================================
# Map each dummy column back to its source column name: get_dummies joins
# the original name and the category value with '_', so the source name
# is the part before the first '_'; names without '_' (numeric columns)
# are kept whole.
# FIX: replaces a fragile manual character scan whose "no underscore"
# branch only fired when the final character ended the loop.
corr_del_columns_origin = []

for column in corr_columns_to_delete:
    origin_column = column.split('_', 1)[0]
    if origin_column not in corr_del_columns_origin:
        corr_del_columns_origin.append(origin_column)

# Print columns count
print(f"Total Dummy Included DF >> {len(columns_all2)} columns")
print(f"Original Categorical DF >> {len(columns_cat2)} columns")
print()
print(f"Total Columns Cnt To Delete(+Numeric) >> {len(corr_del_columns_origin)} columns")

#========================================<4.3. Handle Categorical Columns Using Corr>========================================
# Drop the low-correlation original columns, then one-hot-encode whatever
# categorical columns survived the cut.
df_corr = df_null.drop(columns=corr_del_columns_origin)

# Categorical columns remaining after the drop.
columns_cat_left = [column for column in df_corr.columns
                    if df_corr[column].dtype == "object" or df_corr[column].dtype == "bool"]

print(f"Survived Categorical Columns  ({len(columns_cat_left)} columns)\n  >> {columns_cat_left}\n")

# Dummy-encode the remaining categoricals.
df_corr = pd.get_dummies(df_corr, columns=columns_cat_left)

# Map boolean dummy values to integer 0/1.
df_corr.replace(True, 1, inplace=True)
df_corr.replace(False, 0, inplace=True)

# Print Info
df_corr

#========================================<5. Data Split-1 [Data and Label]>========================================
# Separate the feature matrix X from the target label y.
y = df_corr['Survived'].reset_index(drop=True)
X = df_corr.drop(['Survived'], axis='columns')

print(f"X type >> {type(X)}")
print(f"y type >> {type(y)}")

print(f'X Shape >> {X.shape}')
print(f'y Shape >> {y.shape}')

#========================================<6. Scaling>========================================
# Standardize the features; the fitted Scaler is reused on the test set.
Scaler = StandardScaler()
X_scaled = Scaler.fit_transform(X)

#inverse_scaled_data = Scaler.inverse_transform(scaled_data)

#========================================<7. Data Split-2 [Train and Validation]>========================================
# Hold out 10% of the scaled training data for validation.
split_result = train_test_split(X_scaled, y, test_size=0.1, random_state=24)
X_train, X_valid, y_train, y_valid = split_result

print(f'Train Set Shape >> X_train {X_train.shape}  |  y_train {y_train.shape}')
print(f'Valid Set Shape >> X_valid {X_valid.shape}  |  y_valid {y_valid.shape}')

#========================================<8. Test Data Load>========================================
# Load the held-out test features/labels and apply the SAME preprocessing
# steps as the training data.
test_Data_Path  = "./Data/Final/titanic/test.csv"
test_Label_Path = "./Data/Final/titanic/test_answer.csv"

X_test = pd.read_csv(test_Data_Path)
y_test = pd.read_csv(test_Label_Path)

print(f"Test Set Shape >> {X_test.shape}")

# Drop usless column same as train data
X_test = X_test.drop(columns='PassengerId')

# Drop too much null columns same as train data
X_test = X_test.drop(columns=too_many_null_columns)

# Print null amount
print(X_test.isnull().sum())
print()
print(X_test.isnull().sum().sum())

columns_all3 = list(X_test.columns)
print(f"Test Total Columns  ({len(columns_all3)} columns)\n  >> {columns_all3}\n")

columns_cat3 = []
columns_num3 = []

for column in columns_all3:
    if X_test[column].dtype == "object" or X_test[column].dtype == "bool":
        columns_cat3.append(column)
    else:
        columns_num3.append(column)

print(f"Test Categorical Columns  ({len(columns_cat3)} columns)\n  >> {columns_cat3}\n")
print(f"Test Numeric Columns  ({len(columns_num3)} columns)\n  >> {columns_num3}\n")

# Null replace by data type: numeric -> median, categorical -> mode.
# (Assignment instead of chained inplace fillna: avoids SettingWithCopy
# issues and works under pandas copy-on-write.)
for column in columns_num3:
    X_test[column] = X_test[column].fillna(X_test[column].median())

for column in columns_cat3:
    X_test[column] = X_test[column].fillna(X_test[column].mode()[0])

print(X_test.isnull().sum())
print()
print(X_test.isnull().sum().sum())

# Drop columns used corr same as train data
X_test = X_test.drop(columns=corr_del_columns_origin)

# Make survived categorical column to dummy
X_test = pd.get_dummies(X_test, columns=columns_cat_left)

# Boolean replace
X_test.replace(True, 1, inplace=True)
X_test.replace(False, 0, inplace=True)

# FIX: align the test dummies with the training feature matrix. A
# category missing from (or extra in) the test set would otherwise
# produce a different column set/order and break every model below.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

X_test.info()

y_test = y_test['Survived']

# FIX: transform with the scaler FITTED ON THE TRAINING DATA. The
# original re-fit the scaler on X_test, which leaks test statistics and
# puts train and test on different scales.
X_test_scaled = Scaler.transform(X_test)

print(f'X_test Shape >> {X_test.shape}')
print(f'y_test Shape >> {y_test.shape}')

#========================================<9. Machine Learning>========================================

#========================================<9.0. Comparison>========================================
# Fit six baseline classifiers on the full training data and score each
# of them on the held-out test set.
model_dt = DecisionTreeClassifier(random_state=2024)
model_rf = RandomForestClassifier(random_state=2024)
model_lr = LogisticRegression(random_state=2024)
model_xgb = XGBClassifier(random_state=2024)
#model_xgb = XGBClassifier(random_state=2024, tree_method='gpu_hist', gpu_id=0)
model_lgbm = LGBMClassifier(random_state=2024)
model_cat = CatBoostClassifier(random_state=2024)

def _fit_and_score(model):
    # Fit on the (unscaled) training features, predict the test set,
    # and return (predictions, accuracy).
    model.fit(X, y)
    pred = model.predict(X_test)
    return pred, accuracy_score(y_test, pred)

pred_dt,   acc_dt   = _fit_and_score(model_dt)
pred_rf,   acc_rf   = _fit_and_score(model_rf)
pred_lr,   acc_lr   = _fit_and_score(model_lr)
pred_xgb,  acc_xgb  = _fit_and_score(model_xgb)
pred_lgbm, acc_lgbm = _fit_and_score(model_lgbm)
pred_cat,  acc_cat  = _fit_and_score(model_cat)

# Matrix
print(f'[DecisionTree Classifier]\n{classification_report(y_test, pred_dt)}\n')
print(f'[RandomForest Classifier]\n{classification_report(y_test, pred_rf)}\n')
print(f'[Logistic Regression]\n{classification_report(y_test, pred_lr)}\n')
print(f'[XGBoost Classifier]\n{classification_report(y_test, pred_xgb)}\n')
print(f'[LGBM Classifier]\n{classification_report(y_test, pred_lgbm)}\n')
print(f'[CatBoost Classifier]\n{classification_report(y_test, pred_cat)}')

# Score
print(f'[DecisionTree Classifier] Accuracy >> {acc_dt*100:.2f}%\n')
print(f'[RandomForest Classifier] Accuracy >> {acc_rf*100:.2f}%\n')
print(f'[Logistic Regression] Accuracy >> {acc_lr*100:.2f}%\n')
print(f'[XGBoost Classifier] Accuracy >> {acc_xgb*100:.2f}%\n')
print(f'[LGBM Classifier] Accuracy >> {acc_lgbm*100:.2f}%\n')
print(f'[CatBoost Classifier] Accuracy >> {acc_cat*100:.2f}%')  

#========================================<9.1. ML - Decision Tree Classifier - Grid Search>========================================
pipe_tree = make_pipeline(DecisionTreeClassifier(random_state=2024))

param_grid = {'decisiontreeclassifier__criterion': ['gini', 'entropy'],
              'decisiontreeclassifier__splitter': ['best', 'random'],
              'decisiontreeclassifier__max_depth': [None, 10, 20, 30],
              'decisiontreeclassifier__min_samples_split': [2, 5, 10],
              'decisiontreeclassifier__min_samples_leaf': [1, 2, 4]}

# FIX: this is a classification task — select models by accuracy, not by
# the regression metric neg_mean_absolute_error.
gs = GridSearchCV(estimator=pipe_tree,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

#gs = gs.fit(X_scaled, y)
gs = gs.fit(X, y)

# best_score_ is now an accuracy, so no sign flip is needed.
print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_dt_grid = gs.best_estimator_
model_dt_grid.fit(X, y)

pred_dt_G = model_dt_grid.predict(X_test)
acc_dt_G = accuracy_score(y_test, pred_dt_G)

# Score
print(f'[DecisionTree Classifier] [Grid Search] Accuracy >> {acc_dt_G*100:.2f}%')

#========================================<9.2. ML - Random Forest Classifier - Grid Search>========================================
pipe_tree = make_pipeline(RandomForestClassifier(random_state=2024))

# FIX: max_features='auto' was removed in scikit-learn 1.3 (it meant
# 'sqrt' for classifiers, which is kept below).
param_grid = {'randomforestclassifier__n_estimators': [50, 100, 200],
              'randomforestclassifier__criterion': ['gini', 'entropy'],
              'randomforestclassifier__max_depth': [None, 10, 20, 30],
              'randomforestclassifier__min_samples_split': [2, 5, 10],
              'randomforestclassifier__min_samples_leaf': [1, 2, 3, 4, 5],
              'randomforestclassifier__max_features': ['sqrt', 'log2']}

# FIX: classification metric (was the regression metric
# neg_mean_absolute_error).
gs = GridSearchCV(estimator=pipe_tree,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

gs = gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

# FIX: this evaluation was commented out in the original, leaving
# acc_rf_G undefined for the GridSearch comparison section below.
model_rf_grid = gs.best_estimator_
model_rf_grid.fit(X, y)

pred_rf_G = model_rf_grid.predict(X_test)
acc_rf_G = accuracy_score(y_test, pred_rf_G)

# Score
print(f'[RandomForest Classifier] [Grid Search] Accuracy >> {acc_rf_G*100:.2f}%')

#========================================<9.3. ML - Logistic Regressor - Grid Search>========================================
pipe_tree = make_pipeline(LogisticRegression(random_state=2024))

# FIX: penalty='none' (string) was removed in scikit-learn 1.2 — use
# None. Note that not every penalty/solver pair is valid (e.g. 'l1' with
# 'lbfgs'); error_score below lets the grid skip those combinations.
param_grid = {
    'logisticregression__C': np.logspace(-4, 4, 20),
    'logisticregression__penalty': ['l1', 'l2', 'elasticnet', None],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logisticregression__l1_ratio': np.linspace(0, 1, 10)}

# FIX: classification metric (was neg_mean_absolute_error);
# error_score=np.nan keeps invalid parameter combinations from aborting
# the whole search.
gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1,
                  error_score=np.nan)

gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_lr_grid = gs.best_estimator_
model_lr_grid.fit(X, y)

pred_lr_G = model_lr_grid.predict(X_test)
acc_lr_G = accuracy_score(y_test, pred_lr_G)

# Score
print(f'[Logistic Regression] [Grid Search] Accuracy >> {acc_lr_G*100:.2f}%')

#========================================<9.4. ML - XGBoost Classifier - Grid Search>========================================
pipe_tree = make_pipeline(XGBClassifier(random_state=2024))
#pipe_tree = make_pipeline(XGBClassifier(random_state=2024, tree_method='gpu_hist', gpu_id=0))

param_grid = {
    'xgbclassifier__n_estimators': [50, 100, 200, 300],
    'xgbclassifier__max_depth': [3, 5, 7], 
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.05],
    'xgbclassifier__subsample': [0.8, 0.9, 1.0], 
    'xgbclassifier__colsample_bytree': [0.8, 0.9, 1.0]}

# FIX: classification metric (was the regression metric
# neg_mean_absolute_error).
gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_xgb_grid = gs.best_estimator_
model_xgb_grid.fit(X, y)

pred_xgb_G = model_xgb_grid.predict(X_test)
acc_xgb_G = accuracy_score(y_test, pred_xgb_G)

# Score
print(f'[XGBoost Classifier] [Grid Search] Accuracy >> {acc_xgb_G*100:.2f}%')

#========================================<9.5. ML - LGBM Classifier - Grid Search>========================================
pipe_tree = make_pipeline(LGBMClassifier(random_state=2024))

param_grid = {
    'lgbmclassifier__learning_rate': [0.01, 0.05, 0.1],
    'lgbmclassifier__n_estimators': [50, 100, 200],
    'lgbmclassifier__num_leaves': [20, 30, 40],
    'lgbmclassifier__max_depth': [-1, 10, 20, 30],
    'lgbmclassifier__min_child_samples': [1, 5, 10],
    'lgbmclassifier__subsample': [0.8, 1.0],
    'lgbmclassifier__colsample_bytree': [0.8, 1.0]}

# FIX: classification metric (was the regression metric
# neg_mean_absolute_error).
gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

gs.fit(X, y)

print(gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

model_lgbm_grid = gs.best_estimator_
model_lgbm_grid.fit(X, y)

pred_lgbm_G = model_lgbm_grid.predict(X_test)
acc_lgbm_G = accuracy_score(y_test, pred_lgbm_G)

# Score
print(f'[LGBM Classifier] [Grid Search] Accuracy >> {acc_lgbm_G*100:.2f}%')

#========================================<9.6. ML - CatBoost Classifier - Grid Search>========================================
pipe_tree = make_pipeline(CatBoostClassifier(random_state=2024))

param_grid = {
    'catboostclassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'catboostclassifier__iterations': [50, 100, 200],
    'catboostclassifier__depth': [4, 6, 8, 10],
    'catboostclassifier__l2_leaf_reg': [1, 3, 5, 7, 9],
    'catboostclassifier__border_count': [32, 64, 128]}

# FIX: classification metric (was the regression metric
# neg_mean_absolute_error).
gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

gs.fit(X, y)

print(gs.best_score_)
print(gs.best_params_)

model_cat_grid = gs.best_estimator_
model_cat_grid.fit(X, y)

pred_cat_G = model_cat_grid.predict(X_test)
acc_cat_G = accuracy_score(y_test, pred_cat_G)

# Score
print(f'[CatBoost Classifier] [Grid Search] Accuracy >> {acc_cat_G*100:.2f}%')

#========================================<※ GridSearch Comparison>========================================
# Summarize every grid-searched model's test accuracy.
# FIX: `acc_xg_G` was a typo for `acc_xgb_G`, and `acc_rf_G` is undefined
# in the original file (its evaluation block is commented out) — look up
# each name defensively so one missing result does not crash the report.
for _label, _var in [('DecisionTree Classifier', 'acc_dt_G'),
                     ('RandomForest Classifier', 'acc_rf_G'),
                     ('Logistic Regression', 'acc_lr_G'),
                     ('XGBoost Classifier', 'acc_xgb_G'),
                     ('LGBM Classifier', 'acc_lgbm_G'),
                     ('CatBoost Classifier', 'acc_cat_G')]:
    _acc = globals().get(_var)
    if _acc is None:
        print(f'[{_label}] [GridSearch] Accuracy >> not computed\n')
    else:
        print(f'[{_label}] [GridSearch] Accuracy >> {_acc*100:.2f}%\n')

#========================================<10. Deep Learning>========================================

#========================================<10.1. Network Model>========================================
tf.random.set_seed(2024)

# Fully-connected binary classifier on the scaled features; sigmoid
# output matched with binary cross-entropy.
model_DL = Sequential()
model_DL.add(Dense(units = 32, input_shape = (X_train.shape[1],), activation = 'leaky_relu'))
model_DL.add(Dense(units = 64, activation = 'leaky_relu', kernel_initializer = 'he_normal'))
model_DL.add(Dense(units = 128, activation = 'leaky_relu', kernel_initializer = 'he_normal'))
model_DL.add(Dropout(0.1))
model_DL.add(Dense(units = 64, activation = 'leaky_relu', kernel_initializer = 'he_normal'))
model_DL.add(Dropout(0.1))
model_DL.add(Dense(units = 32, activation = 'leaky_relu', kernel_initializer = 'he_normal'))
model_DL.add(Dropout(0.1))
model_DL.add(Dense(units = 16, activation = 'leaky_relu', kernel_initializer = 'he_normal'))
model_DL.add(Dense(units = 8, activation = 'leaky_relu', kernel_initializer = 'he_normal'))
model_DL.add(Dense(units = 1 , activation = 'sigmoid'))

# Compile
model_DL.compile (loss = 'binary_crossentropy',
                  optimizer = tf.keras.optimizers.Nadam(0.0001),
                  metrics = 'accuracy')

model_DL.summary()

# EarlyStopping: halt when val_loss stops improving for 10 epochs.
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# ModelCheckpoint: keep only the best-val_loss weights.
checkpoint_path = 'tmp_checkpoint.ckpt'

cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

# Fit
with tf.device('/GPU:0'):
    history = model_DL.fit(X_train, y_train, 
                        epochs=70, 
                        batch_size=64, 
                        validation_data=(X_valid, y_valid), 
                        verbose =1, 
                        callbacks=[es, cp])

# Learning accuracy graph
epochs = range(1, len(history.history['accuracy']) + 1)
plt.plot(epochs, history.history['accuracy'])
plt.plot(epochs, history.history['val_accuracy'])
plt.title('model accuracy')
# FIX: report the best VALIDATION accuracy (the original took the max of
# the TRAINING accuracy while labelling it "Val Acc").
plt.text(10, 0.5, f"Max Val Acc : {max(history.history['val_accuracy']):.2f}", fontsize=12, color='Green')
plt.ylabel('accuracy')  # FIX: axis was mislabelled 'loss'
plt.xlabel('epoch')
plt.legend(['train', 'valid'], )
plt.show()

# Learning loss graph
epochs = range(1, len(history.history['loss']) + 1)
plt.plot(epochs, history.history['loss'])
plt.plot(epochs, history.history['val_loss'])
plt.title('Learning Loss')
plt.text(10, 0.8, f"Min Val Loss : {min(history.history['val_loss']):.2f}", fontsize=12, color='Red')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], )
plt.show()

#========================================<10.2. Test Score>========================================
# Evaluate the network on the test set and derive hard 0/1 predictions.
try:
    X_test_scaled = X_test_scaled.to_numpy()
    y_test = y_test.to_numpy()
except AttributeError:
    # Already plain numpy arrays — nothing to convert.
    print('Pass!')

with tf.device('/GPU:0'):
    # FIX: for this compile config evaluate() returns (loss, accuracy);
    # the original misnamed them test_mse/test_mae.
    test_loss, test_acc = model_DL.evaluate(X_test_scaled, y_test)

# FIX: pred_DL was never computed in the original before being
# thresholded below — predict the sigmoid probabilities here.
with tf.device('/GPU:0'):
    pred_DL = model_DL.predict(X_test_scaled)

# Binarization of the sigmoid output at the 0.5 threshold
pred_DL[pred_DL < 0.5] = 0
pred_DL[pred_DL >= 0.5] = 1
pred_DL = pred_DL.astype('int64')

acc_DL = accuracy_score(y_test, pred_DL)

print(f'[Deep Learning] Accuracy >> {acc_DL*100:.2f}%')

#========================================<11. Final Score Comparison>========================================
print(f'[DecisionTree Classifier] Accuracy >> {acc_dt*100:.2f}%\n')
print(f'[RandomForest Classifier] Accuracy >> {acc_rf*100:.2f}%\n')
print(f'[Logistic Regression] Accuracy >> {acc_lr*100:.2f}%\n')
print(f'[XGBoost Classifier] Accuracy >> {acc_xgb*100:.2f}%\n')
print(f'[LGBM Classifier] Accuracy >> {acc_lgbm*100:.2f}%\n')
print(f'[CatBoost Classifier] Accuracy >> {acc_cat*100:.2f}%\n') 

#print(f'[Logistic Regression] [GridSearch] Accuracy >> {acc_lr_G*100:.2f}%\n\n')

print(f'\n[Deep Learning Classifier] Accuracy >> {acc_DL*100:.2f}%') 

# Select best model
best_model = model_lr_grid

# Persist the chosen model, then reload it to verify the round trip.
with open('best_model.pickle','wb') as fw:
    pickle.dump(best_model, fw)

with open('best_model.pickle','rb') as f:
    best_model = pickle.load(f)

# Deep Learning
"""
best_model.save('best_model.h5')
best_model = tf.keras.models.load_model('best_model.h5')
"""

# Predict using best_model (the grid models were fitted on the unscaled
# X, so predict on the unscaled X_test as well).
y_pred_final = best_model.predict(X_test)
#y_pred_final = best_model.predict(X_test_scaled)

# Result Check
#y_pred_final[:5]

# Final score
acc_final = accuracy_score(y_test, y_pred_final)
print(f'[Final Model] Accuracy >> {acc_final*100:.2f}%') 

#========================================<12. Submit>========================================
test_Data_Path  = "./Data/Final/titanic/test.csv"
df_final = pd.read_csv(test_Data_Path)

df_final.insert(0, 'Predict', y_pred_final)

# Result Check
df_final.head()

# Save Result Data
# FIX: index=False keeps the spurious pandas row-index column out of the
# submission file.
df_final.to_csv('DF_Final.csv', index=False)

# --- Blog scrape footer (not code) ---
# Comments
#
# Popular posts from this blog
#
# [Kaggle] Pizza or Not Classification (Computer Vision)
#
# Machine Learning ShootOut