[Kaggle] Boston House Price Regression




"""
<< Index >>
- 0. Modules
- 1. Train Data Load
- 2. Null Data
- 3. Outliers & One-Hot Encoding
- 4. Correlation Analysis
- 5. Data Split-1 [Data and Label]
- 6. Scaling
- 7. Data Split-2 [Train and Validation]
- 8. Test Data Load
- 9. Machine Learning
  - 9.0. Comparison
  - 9.1. ML - Decision Tree Regressor - Grid Search
  - 9.2. ML - Random Forest Regressor - Grid Search
  - 9.3. ML - Logistic Regression - Grid Search
  - 9.4. ML - LGBM Regressor - Grid Search
  - 9.5. ML - CatBoost Regressor - Grid Search
- 10. Deep Learning
- 11. Final Score
- 12. Submit
"""


#========================================<0. Modules>========================================
import sklearn as sk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score, r2_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, classification_report

from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool2D, Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import make_pipeline

import time
import os
import shutil
import math
import copy
from tqdm.notebook import tqdm
import pickle

#========================================<1. Train Data Load>========================================
# Read the training CSV into a DataFrame and report its dimensions
train_path = "./Data/Final/house_price/train.csv"
df_train = pd.read_csv(train_path)
print(f"trainset shape >> {df_train.shape}")

# Preview the first rows (a bare expression: only displayed in a notebook cell)
df_train.head()

# Remove the 'Id' column before any further processing
df_train = df_train.drop('Id', axis=1)

# Summarize column dtypes and non-null counts
df_train.info()

#========================================<2. Null Data>========================================
# Partition the column names by dtype: object/bool count as categorical,
# everything else as numeric.
columns_all = list(df_train.columns)
print(f"Total Columns  ({len(columns_all)} columns)\n  >> {columns_all}\n")

columns_num = []
columns_cat = []

for column in columns_all:
    if df_train[column].dtype == "object" or df_train[column].dtype == "bool":
        columns_cat.append(column)
    else:
        columns_num.append(column)

print(f"Categorical Columns  ({len(columns_cat)} columns)\n  >> {columns_cat}\n")
print(f"Numeric Columns  ({len(columns_num)} columns)\n  >> {columns_num}\n")

# Show the value distribution of every categorical column
for column in columns_cat:
    print(f"<< {column} >>")
    print(df_train[column].value_counts())
    print('\n'*2)

# Per-column null counts, followed by the grand total
null_counts = df_train.isnull().sum()
print(null_counts)
print()
print(null_counts.sum())

# Report the null ratio of every column that contains nulls; columns with
# more than 20% nulls are collected for removal.
too_many_null_columns = []

for column in columns_all:
    null_ratio = df_train[column].isnull().mean()

    if null_ratio == 0:
        continue

    if null_ratio > 0.2:
        too_many_null_columns.append(column)

    print(f"<< {column} - Null ratio >>")
    print(f"{null_ratio*100:.2f}%")

print(f"\nColumns to delete because of Null ({len(too_many_null_columns)} columns) \n  >> {too_many_null_columns}")

# Null handled DataFrame: work on an independent copy so that df_train keeps
# the raw values instead of being modified through an alias.
df_null = df_train.copy()

# Numeric columns: fill nulls with the column median. The result is assigned
# back explicitly because `df[col].fillna(..., inplace=True)` is a chained
# assignment, which pandas does not guarantee to write through to the frame.
for column in columns_num:
    Median = df_null[column].median()
    df_null[column] = df_null[column].fillna(Median)

# Categorical columns are not imputed here: the sparse ones are dropped as
# whole columns, and rows that still contain nulls are dropped afterwards.

# Drop columns that include too many nulls
df_null = df_null.drop(columns=too_many_null_columns)

# Drop the remaining rows with nulls and renumber the index
df_null = df_null.dropna().reset_index(drop=True)

# Check null replacement result
print(df_null.isnull().sum())
print()
print(df_null.isnull().sum().sum())

# Check DataFrame
df_null.info()

#========================================<3. Outliers & One-Hot Encoding>========================================
# Re-split the columns that survived null handling by dtype
columns_all2 = list(df_null.columns)
print(f"Total Columns  ({len(columns_all2)} columns)\n  >> {columns_all2}\n")

columns_num2 = []
columns_cat2 = []

for column in columns_all2:
    # Classify from df_null itself, the frame these lists are used to index
    if df_null[column].dtype == "object" or df_null[column].dtype == "bool":
        columns_cat2.append(column)
    else:
        columns_num2.append(column)

print(f"Categorical Columns  ({len(columns_cat2)} columns)\n  >> {columns_cat2}\n")
print(f"Numeric Columns  ({len(columns_num2)} columns)\n  >> {columns_num2}\n")

# Separate the frame into its categorical and numeric parts
df_cat = df_null[columns_cat2]
df_num = df_null[columns_num2]

# Check numeric data distribution (Outliers)
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

# Set the target
target_column = 'SalePrice'
print("<< Outliers >>\n")

# Print the outlier ratio of every numeric feature
for column in columns_num2:
    q25, q75 = np.quantile(df_num[column], [0.25, 0.75])
    iqr = q75 - q25

    # Only values beyond the outer fence (3 * IQR from the quartiles) are counted
    outer_fence = iqr * 3
    lower_limit, upper_limit = q25 - outer_fence, q75 + outer_fence

    lower_outlier_cnt = int((df_num[column] < lower_limit).sum())
    upper_outlier_cnt = int((df_num[column] > upper_limit).sum())
    total_outlier_cnt = lower_outlier_cnt + upper_outlier_cnt
    ratio = total_outlier_cnt / len(df_num[column]) * 100

    # Outliers are only reported here; the values in df_num are left unchanged.
    # The target column is excluded from the report.
    if column != target_column and total_outlier_cnt != 0:
        print(f"{column} | Low:{lower_outlier_cnt}ea | Upper:{upper_outlier_cnt}ea | Total:{total_outlier_cnt}ea | Ratio:{ratio:.2f}%")

# Categorical One-Hot-Encoding
df_cat_encoded = pd.get_dummies(df_cat, columns=columns_cat2)

# Count the dummy columns
df_cat_encoded.info()

# Merge DataFrame (dummy columns first, numeric columns last)
df_merge = pd.concat([df_cat_encoded, df_num], axis=1)

# Merged columns
columns_all2 = list(df_merge.columns)
print(columns_all2, '\n')
print(f"DF_merge >> {len(columns_all2)} columns")

#========================================<4. Correlation Analysis>========================================
corr_data = df_merge.corr()

corr_columns_to_delete = []

# Collect every column whose absolute correlation with SalePrice is below 0.1.
# Values are looked up by label (not by integer position) and every column
# except the target itself is examined, including the first one.
print("<< Column under abs 0.1 in correlation >>\n")
for column, corr in corr_data["SalePrice"].items():
    if column == "SalePrice":
        continue
    if abs(corr) < 0.1:
        print(f"[{column}] >> {corr}")
        corr_columns_to_delete.append(column)

# Columns to delete and total count
print(f'Total Dummy Columns To Delete  \n  >> {corr_columns_to_delete}\n')
print(f"Count >> {len(corr_columns_to_delete)} columns")


# Original names of dummies
corr_del_columns_origin = []

for column in corr_columns_to_delete:
    # A dummy column is named "<original>_<category>", so the original name is
    # the text before the first "_". A name without "_" (or with "_" only as
    # its last character) is not treated as a dummy and is kept whole.
    prefix, separator, remainder = column.partition('_')
    origin_column = prefix if separator and remainder else column

    if origin_column not in corr_del_columns_origin:
        corr_del_columns_origin.append(origin_column)

# Print columns count
print(f"Total Dummy Included DF >> {len(columns_all2)} columns")
print(f"Original Categorical DF >> {len(columns_cat2)} columns")
print()
print(f"Total Columns Cnt To Delete(+Numeric) >> {len(corr_del_columns_origin)} columns")

# Drop columns from Non-Dummy-DataFrame
df_corr = df_null.drop(columns=corr_del_columns_origin)

# Survived categorical columns from correlation analysis
columns_cat_left = [column for column in df_corr.columns
                    if df_null[column].dtype == "object"]

print(f"Survived Categorical Columns  ({len(columns_cat_left)} columns)\n  >> {columns_cat_left}\n")

# Make dummy
df_corr = pd.get_dummies(df_corr, columns=columns_cat_left)

# Cast boolean dummy columns to integers (True -> 1, False -> 0)
bool_columns = df_corr.select_dtypes(include="bool").columns
df_corr = df_corr.astype({column: int for column in bool_columns})

# Print columns
df_corr

#========================================<5. Data Split-1 [Data and Label]>========================================
# Features: every column except the target. Label: the SalePrice column
# with a fresh 0..n-1 index.
y = df_corr['SalePrice'].reset_index(drop=True)
X = df_corr.drop(columns=['SalePrice'])

# Report container types and shapes
print(f"X type >> {type(X)}")
print(f"y type >> {type(y)}")

print(f'X Shape >> {X.shape}')
print(f'y Shape >> {y.shape}')

#========================================<6. Scaling>========================================
# Standardize the feature matrix. The fitted scaler is kept under both names.
Scaler = StandardScaler()
X_scaled = Scaler.fit_transform(X)
scaler_fit_X = Scaler

# To map scaled values back to the original units:
#   Scaler.inverse_transform(scaled_data)

#========================================<7. Data Split-2 [Train and Validation]>========================================
# Hold out 10% of the (unscaled) data for validation.
# No `stratify` argument: SalePrice is a continuous regression target, and a
# stratified split needs every distinct label value to occur at least twice,
# so passing `stratify=y` raises ValueError for data like this.
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.1,
                                                      random_state=24)

print(f'Train Set Shape >> X_train {X_train.shape}  |  y_train {y_train.shape}')
print(f'Valid Set Shape >> X_valid {X_valid.shape}  |  y_valid {y_valid.shape}')

#========================================<8. Test Data Load>========================================
test_Data_Path  = "./Data/Final/house_price/test.csv"
test_Label_Path = "./Data/Final/house_price/test_answer.csv"

X_test = pd.read_csv(test_Data_Path)
y_test = pd.read_csv(test_Label_Path)

print(f"Test Set Shape >> {X_test.shape}")

# Drop useless column
X_test = X_test.drop(columns='Id')

# Drop same columns as train set (high null ratio)
X_test = X_test.drop(columns=too_many_null_columns)

# Print null count
print(X_test.isnull().sum())
print()
print(X_test.isnull().sum().sum())

# Print columns count
columns_all3 = list(X_test.columns)
print(f"Test Total Columns  ({len(columns_all3)} columns)\n  >> {columns_all3}\n")

columns_cat3 = []
columns_num3 = []

for column in columns_all3:
    if X_test[column].dtype == "object" or X_test[column].dtype == "bool":
        columns_cat3.append(column)
    else:
        columns_num3.append(column)

print(f"Test Categorical Columns  ({len(columns_cat3)} columns)\n  >> {columns_cat3}\n")
print(f"Test Numeric Columns  ({len(columns_num3)} columns)\n  >> {columns_num3}\n")

# Null replacement by column type. Results are assigned back explicitly:
# `X_test[col].fillna(..., inplace=True)` is a chained assignment that pandas
# does not guarantee to write through to the frame.
for column in columns_num3:
    Median = X_test[column].median()
    X_test[column] = X_test[column].fillna(Median)

for column in columns_cat3:
    modes = X_test[column].mode()
    if modes.empty:
        # Column is entirely null: there is no mode to fill with
        continue
    Mode = modes[0]
    X_test[column] = X_test[column].fillna(Mode)

# Check null
print(X_test.isnull().sum())
print()
print(X_test.isnull().sum().sum())

# Drop same columns as train set (correlation)
X_test = X_test.drop(columns=corr_del_columns_origin)

# Make categorical dummy
X_test = pd.get_dummies(X_test, columns=columns_cat_left)

# Align with the training feature matrix: categories missing from the test
# set become all-zero columns, categories never seen in training are dropped,
# and the column order matches X so fitted models receive the same layout.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Cast boolean dummy columns to integers (True -> 1, False -> 0)
bool_columns_test = X_test.select_dtypes(include="bool").columns
X_test = X_test.astype({column: int for column in bool_columns_test})

# The test features are left unscaled, matching the unscaled X used to fit
# the models in the following sections.

X_test

X_test.info()

y_test = y_test['SalePrice']

print(f'X_test Shape >> {X_test.shape}')
print(f'y_test Shape >> {y_test.shape}')

#========================================<9. Machine Learning>========================================

#========================================<9.0. Comparison>========================================
# Baseline comparison: each model is fit with default hyper-parameters on the
# full training data (X, y) and scored on the test set with MAPE.

# Decision tree regressor
model_dt = DecisionTreeRegressor(random_state=2024)
model_dt.fit(X, y)
pred_dt = model_dt.predict(X_test)
mape_dt = mean_absolute_percentage_error(y_test, pred_dt)

# Random forest regressor
model_rf = RandomForestRegressor(random_state=2024)
model_rf.fit(X, y)
pred_rf = model_rf.predict(X_test)
mape_rf = mean_absolute_percentage_error(y_test, pred_rf)

# Logistic regression
# NOTE(review): LogisticRegression is a classifier; here it is fit on the
# SalePrice target. Confirm this is intended rather than a linear regressor.
model_lr = LogisticRegression(random_state=2024)
model_lr.fit(X, y)
pred_lr = model_lr.predict(X_test)
mape_lr = mean_absolute_percentage_error(y_test, pred_lr)

# LightGBM regressor
model_lgbm = LGBMRegressor(random_state=2024)
model_lgbm.fit(X, y)
pred_lgbm = model_lgbm.predict(X_test)
mape_lgbm = mean_absolute_percentage_error(y_test, pred_lgbm)

# CatBoost regressor
model_cat = CatBoostRegressor(random_state=2024)
model_cat.fit(X, y)
pred_cat = model_cat.predict(X_test)
mape_cat = mean_absolute_percentage_error(y_test, pred_cat)

# Side-by-side report (MAPE shown as a percentage)
print(f'[DecisionTree Regressor] Loss MAPE >> {mape_dt*100:.2f}%\n')
print(f'[RandomForest Regressor] Loss MAPE >> {mape_rf*100:.2f}%\n')
print(f'[Logistic Regression] Loss MAPE >> {mape_lr*100:.2f}%\n')
print(f'[LGBM Regressor] Loss MAPE >> {mape_lgbm*100:.2f}%\n')
print(f'[CatBoost Regressor] Loss MAPE >> {mape_cat*100:.2f}%')

#========================================<9.1. ML - Decision Tree Regressor - Grid Search>========================================
# Tune tree depth and minimum leaf size with 10-fold CV, scored by negated MAE
pipe_tree = make_pipeline(DecisionTreeRegressor(random_state=2024))

param_grid = {'decisiontreeregressor__max_depth': list(range(1, 30, 1)),
              'decisiontreeregressor__min_samples_leaf': list(range(1, 50, 5))}

gs = GridSearchCV(estimator=pipe_tree,
                  param_grid=param_grid,
                  scoring='neg_mean_absolute_error',
                  cv=10,
                  n_jobs=-1)

gs = gs.fit(X, y)

# The score is a negated MAE, so flip the sign for display
print(-gs.best_score_)
print(gs.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole of (X, y) using the best parameters; no second fit is required.
model_dt_grid = gs.best_estimator_

pred_dt = model_dt_grid.predict(X_test)

print(f'[DecisionTree Regressor] [Grid Search] MAPE >> {mean_absolute_percentage_error(y_test, pred_dt)*100:.2f}%')

#========================================<9.2. ML - Random Forest Regressor - Grid Search>========================================
# Tune forest size and tree-shape parameters, scored by negated MAE.
# `cv` is not passed, so GridSearchCV uses its default number of folds.
pipe_tree = make_pipeline(RandomForestRegressor(random_state=2024))

param_grid = {'randomforestregressor__n_estimators': list(range(10, 50, 10)),
              'randomforestregressor__max_depth': [None, 5, 10, 15, 20],
              'randomforestregressor__min_samples_split': [2, 5, 10],
              'randomforestregressor__min_samples_leaf': [1, 2, 4],
              'randomforestregressor__max_leaf_nodes': [None, 5, 10, 20],
              'randomforestregressor__bootstrap': [True, False]}

gs = GridSearchCV(estimator=pipe_tree,
                  param_grid=param_grid,
                  scoring='neg_mean_absolute_error',
                  n_jobs=-1)

gs = gs.fit(X, y)

# The score is a negated MAE, so flip the sign for display
print(-gs.best_score_)
print(gs.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole of (X, y) using the best parameters; no second fit is required.
model_rf_grid = gs.best_estimator_

pred_rf = model_rf_grid.predict(X_test)

print(f'[RandomForest Regressor] [Grid Search] MAPE >> {mean_absolute_percentage_error(y_test, pred_rf)*100:.2f}%')

#========================================<9.3. ML - Logistic Regression - Grid Search>========================================
# Tune regularization strength and solver with 10-fold CV, scored by negated MAE.
# NOTE(review): LogisticRegression is a classifier; here it is fit on the
# SalePrice target. Confirm this is intended rather than a linear regressor.
pipe_tree = make_pipeline(LogisticRegression(random_state=2024))

param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10],
    'logisticregression__solver': ['liblinear', 'saga']
}

gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='neg_mean_absolute_error',
                  cv=10,
                  n_jobs=-1)

gs.fit(X, y)

# The score is a negated MAE, so flip the sign for display
print(-gs.best_score_, '\n')
print(gs.best_params_, '\n'*3)

# With the default refit=True, best_estimator_ has already been refit on the
# whole of (X, y) using the best parameters; no second fit is required.
model_lr_grid = gs.best_estimator_

pred_lr = model_lr_grid.predict(X_test)

print(f'[Logistic Regression] [Grid Search] MAPE >> {mean_absolute_percentage_error(y_test, pred_lr)*100:.2f}%')

#========================================<9.4. ML - LGBM Regressor - Grid Search>========================================
# Tune the LightGBM regressor with 10-fold CV, scored by negated MAE
pipe_tree = make_pipeline(LGBMRegressor(random_state=2024))

param_grid = {
    'lgbmregressor__learning_rate': [0.1, 0.15, 0.2],
    'lgbmregressor__max_depth': [1, 5, 10, 20, 30],
    'lgbmregressor__num_leaves': [10, 20, 30, 50, 100],
    'lgbmregressor__n_estimators': [50, 100, 200],
    'lgbmregressor__min_child_weight': [10, 20, 30],
    'lgbmregressor__colsample_bytree': [0.5, 0.7, 1.0]
}

gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='neg_mean_absolute_error',
                  cv=10,
                  n_jobs=-1)

gs.fit(X, y)

# The score is a negated MAE, so flip the sign for display
print(-gs.best_score_)
print(gs.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole of (X, y) using the best parameters; no second fit is required.
model_lgbm_grid = gs.best_estimator_

pred_lgbm = model_lgbm_grid.predict(X_test)

# The model is LGBMRegressor, so the label says "Regressor"
print(f'[LGBM Regressor] [Grid Search] MAPE >> {mean_absolute_percentage_error(y_test, pred_lgbm)*100:.2f}%')

#========================================<9.5. ML - Cat Boost Regressor - Grid Search>========================================
# Tune the CatBoost regressor with 10-fold CV, scored by negated MAE
from sklearn.model_selection import GridSearchCV

pipe_tree = make_pipeline(CatBoostRegressor(random_state=2024))

param_grid = {
    'catboostregressor__depth': list(range(1, 10, 1)),
    'catboostregressor__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'catboostregressor__iterations': [100, 200, 300, 400, 500]
}

gs = GridSearchCV(pipe_tree,
                  param_grid,
                  scoring='neg_mean_absolute_error',
                  cv=10,
                  n_jobs=-1)

gs.fit(X, y)

# The score is a negated MAE, so flip the sign for display
print(-gs.best_score_)
print(gs.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole of (X, y) using the best parameters; no second fit is required.
model_cat_grid = gs.best_estimator_

pred_cat = model_cat_grid.predict(X_test)

# The model is CatBoostRegressor, so the label says "Regressor"
print(f'[CatBoost Regressor] [Grid Search] MAPE >> {mean_absolute_percentage_error(y_test, pred_cat)*100:.2f}%')

#========================================<10. Deep Learning>========================================
tf.random.set_seed(2024)

# Small fully connected network with a single linear output unit
model_DL = Sequential()

model_DL.add(Dense(units=16, input_shape=(X_train.shape[1],), activation='relu'))
model_DL.add(Dense(units=16, activation='relu'))
model_DL.add(Dense(units=8, activation='relu'))
model_DL.add(Dense(units=1))

# Loss is MSE; MAE is tracked as the extra metric so that evaluate() returns
# (mse, mae), matching the names it is unpacked into below. Metrics are
# passed as a list rather than a bare string.
model_DL.compile(loss='mse',
                 optimizer=tf.keras.optimizers.Nadam(0.001),
                 metrics=['mae'])

model_DL.summary()

# Stop once val_loss has not improved for 10 epochs; checkpoint the best epoch
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
checkpoint_path = 'tmp_checkpoint.ckpt'
cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

with tf.device('/GPU:0'):
    history = model_DL.fit(X_train, y_train,
                           epochs=100,
                           batch_size=64,
                           validation_data=(X_valid, y_valid),
                           verbose=1,
                           callbacks=[es, cp])

# Loss graph
epochs = range(1, len(history.history['loss']) + 1)
plt.plot(epochs, history.history['loss'])
plt.plot(epochs, history.history['val_loss'])
plt.title('Learning Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'], )
plt.show()

# Convert the test data to NumPy arrays. Only AttributeError is tolerated:
# it is what an object without `to_numpy` (e.g. an array converted on an
# earlier run of this cell) raises. Any other error propagates.
try:
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()
except AttributeError:
    print('Pass!')

# evaluate() returns the loss followed by the compiled metrics
with tf.device('/CPU:0'):
    test_mse, test_mae = model_DL.evaluate(X_test, y_test)

# Predict
with tf.device('/CPU:0'):
    pred_DL = model_DL.predict(X_test)

# The target is not scaled in this script, so the predictions need no
# inverse transform.

pred_DL

y_test

print(f'[Deep Learning] MAPE >> {mean_absolute_percentage_error(y_test, pred_DL)*100:.2f}%')

#========================================<11. Final Score>========================================
# Final MAPE of every model on the test set. The tree/boosting models are
# regressors, so they are labelled "Regressor" as in the 9.0 comparison.
print(f'[DecisionTree Regressor] MAPE >> {mean_absolute_percentage_error(y_test, pred_dt)*100:.2f}%\n')
print(f'[RandomForest Regressor] MAPE >> {mean_absolute_percentage_error(y_test, pred_rf)*100:.2f}%\n')
print(f'[Logistic Regression] MAPE >> {mean_absolute_percentage_error(y_test, pred_lr)*100:.2f}%\n')
print(f'[LGBM Regressor] MAPE >> {mean_absolute_percentage_error(y_test, pred_lgbm)*100:.2f}%\n')
print(f'[CatBoost Regressor] MAPE >> {mean_absolute_percentage_error(y_test, pred_cat)*100:.2f}%\n')
print(f'[Deep Learning] MAPE >> {mean_absolute_percentage_error(y_test, pred_DL)*100:.2f}%')

# The Keras network is the model that gets saved and used for submission
best_model = model_DL

# To persist one of the scikit-learn style models instead, pickle it:
#   with open('best_model.pickle', 'wb') as fw:
#       pickle.dump(best_model, fw)
#   with open('best_model.pickle', 'rb') as f:
#       best_model = pickle.load(f)

# Save the Keras model to HDF5 and load it back from disk
best_model.save('best_model.h5')
best_model = tf.keras.models.load_model('best_model.h5')

# Predict using best_model
y_pred_final = best_model.predict(X_test)

# Predict result Check
#y_pred_final[:5]

#========================================<12. Submit>========================================
# Reload the raw test CSV and attach the predictions as the first column
test_Data_Path  = "./Data/Final/house_price/test.csv"
df_final = pd.read_csv(test_Data_Path)

df_final.insert(loc=0, column='Predict', value=y_pred_final)

# Preview the result (a bare expression: only displayed in a notebook cell)
df_final.head()

# Write the result to disk
df_final.to_csv('DF_Final.csv')
    

Comments

Popular posts from this blog

[Kaggle] Titanic Survivor Classification

Machine Learning ShootOut

[Kaggle] Pizza or Not Classification (Computer Vision)