[Kaggle] Fashion Shopping Review Classification (NLP)

Load the review text data and build an LSTM-based deep learning model to classify ratings.




"""
<< Index >>
- 0. Modules
- 1. Train Data Load
- 2. Data Preprocessing
  - 2.1. Null Data
  - 2.2. Language & Word Filtering, Duplicate Check
  - 2.3. Label Check
  - 2.4. Label Encoding
- 3. Data Split
  - 3.1. Features & Labels
  - 3.2. Train & Valid
- 4. Test Data Load
- 5. Machine Learning
  - 5.0. Total Comparison
  - 5.1. Naive Bayes Classification
  - 5.2. Support Vector Machine
- 6. Deep Learning
  - 6.1. Tokenizing & Padding
  - 6.2. LSTM Model
  - 6.3. Test Data
- 7. Final Score
- 8. Submit
"""


#========================================<0. Modules>========================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score, r2_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB 

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#========================================<1. Train Data Load>========================================
df_train = pd.read_csv('./Data/Final/fashion_review/train.csv')

df_train.head()

df_train.info()

#========================================<2. Data Preprocessing>========================================

#========================================<2.1. Null Data>========================================
df_train.isnull().sum()

# Print null ratio
print(f"Null Ratio >> {df_train['Review Text'].isnull().sum() / len(df_train)*100:.2f}%")

# Drop null
df_train = df_train.dropna()
df_train.reset_index(drop=True, inplace=True)

# Check again
df_train.isnull().sum()

df_train.info()

#========================================<2.2. Language & Word Filtering, Duplicate Check>========================================
# Show rows that contain non-alphabetic characters
df_train[df_train['Review Text'].str.contains('[^A-Za-z]')].values[:10]

# Remove non-alphabetic characters (keep spaces, as in section 6.3)
df_train['Review Text'] = df_train['Review Text'].str.replace('[^A-Za-z ]', '', regex=True)

# Strip leading and trailing whitespace
df_train['Review Text'] = df_train['Review Text'].str.strip()

# Count Duplicated Data 
df_train['Review Text'].duplicated().sum()

# Duplicated data ratio
print(f"{df_train['Review Text'].duplicated().sum() / len(df_train['Review Text']) * 100:.2f}%")

# Drop duplicates 
df_train = df_train.drop_duplicates(['Review Text'], keep = 'first')

# Check again
df_train['Review Text'].duplicated().sum()

#========================================<2.3. Label Check>========================================
df_train['Rating'].value_counts()

# Plot bar graph
df_train['Rating'].value_counts().plot(kind='bar') 

# Plot pie chart
df_train['Rating'].value_counts().plot(kind='pie')
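
# Hedged aside (not used in this post): the rating distribution is imbalanced,
# so balanced class weights could be computed and later passed to a
# classifier's fit(). The usage below is an assumption, only for illustration.
from sklearn.utils.class_weight import compute_class_weight

rating_classes = np.unique(df_train['Rating'])
rating_weights = compute_class_weight(class_weight='balanced',
                                      classes=rating_classes,
                                      y=df_train['Rating'])
print(dict(zip(rating_classes, rating_weights)))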

#========================================<2.4. Label Encoding>========================================

encoder = LabelEncoder()
df_train['Rating'] = encoder.fit_transform(df_train['Rating'])

element_list = sorted(df_train['Rating'].unique())

print("Before Encoding >>", encoder.classes_)
print(" After Encoding >>", element_list)

# Class num 
class_num = len(element_list)
print(class_num)

#========================================<3. Data Split>========================================

#========================================<3.1. Features & Labels>========================================
X = df_train['Review Text'].values
y = df_train['Rating'].values

print(f"Feature data shape : {X.shape}\nLabel data shape : {y.shape}")

print(f'Maximum Sentence Length : {max(len(l) for l in X)}')
print(f'Average Sentence Length : {sum(map(len, X))/len(X):.2f}')

# Data distribution graph
plt.hist([len(s) for s in X], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
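
# Hedged aside: instead of padding to the absolute maximum later on, a
# percentile cutoff (an assumption, not what this post does) keeps sequences
# much shorter while covering most samples in the histogram above.
length_p95 = int(np.percentile([len(s) for s in X], 95))
print(f'95th percentile length : {length_p95}')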

#========================================<3.2. Train & Valid>========================================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y, 
    random_state=24)

print(f'Train Dataset >> X_train {X_train.shape}, y_train {y_train.shape}')
print(f'Valid Dataset >> X_valid {X_valid.shape}, y_valid {y_valid.shape}')

#Before Encoding >> ['Bad' 'Good' 'Neutral' 'VeryBad' 'VeryGood']
#After Encoding >> [0, 1, 2, 3, 4]

for i in range(3):
    print(X_train[i])
    print(y_train[i])
    print()

#========================================<4. Test Data Load>========================================
df_test = pd.read_csv('./Data/Final/fashion_review/test.csv')

len(df_test)

print("Before Null Replacement")
print(df_test.isnull().sum())

# Replace null (do not delete)
df_test = df_test.fillna("")

print("\nAfter Null Replacement")
print(df_test.isnull().sum())

# Label encoding (reuse the encoder fitted on the train labels; do not refit)
df_test['Rating'] = encoder.transform(df_test['Rating'])

element_list_test = sorted(df_test['Rating'].unique())

print("Before Encoding >>", encoder.classes_)
print(" After Encoding >>", element_list_test)
print()

X_test = df_test['Review Text'].values
y_test = df_test['Rating'].values

print(f"Feature data shape : {X_test.shape}\nLabel data shape : {y_test.shape}")

#========================================<5. Machine Learning>========================================

#========================================<5.0. Total Comparison>========================================
# ※ TF-IDF Vectorizing
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

#print(X_tfidf)

# TF-IDF matrix shape: (documents, vocabulary terms)
X_tfidf.shape

X_test_v = tfidf.transform(X_test)

model_dt = DecisionTreeClassifier(random_state=2024)
model_rf = RandomForestClassifier(random_state=2024)
model_lr = LogisticRegression(random_state=2024)
model_lgbm = LGBMClassifier(random_state=2024)
model_cat = CatBoostClassifier(random_state=2024)

# DecisionTree Classifier
model_dt.fit(X_tfidf, y)
pred_dt = model_dt.predict(X_test_v)
acc_dt = accuracy_score(y_test, pred_dt)

# RandomForest Classifier
model_rf.fit(X_tfidf, y)
pred_rf = model_rf.predict(X_test_v)
acc_rf = accuracy_score(y_test, pred_rf)

# Logistic Regression
model_lr.fit(X_tfidf, y)
pred_lr = model_lr.predict(X_test_v)
acc_lr = accuracy_score(y_test, pred_lr)

# LGBM Classifier
model_lgbm.fit(X_tfidf, y)
pred_lgbm = model_lgbm.predict(X_test_v)
acc_lgbm = accuracy_score(y_test, pred_lgbm)

# Skipped: fitting CatBoost takes about 7 minutes
"""
# CatBoost Classifier
model_cat.fit(X_tfidf, y)
pred_cat = model_cat.predict(X_test_v)
acc_cat = accuracy_score(y_test, pred_cat)
"""

print(f'[DecisionTree Classifier]\n{classification_report(y_test, pred_dt)}\n')
print(f'[RandomForest Classifier]\n{classification_report(y_test, pred_rf)}\n')
print(f'[Logistic Regression]\n{classification_report(y_test, pred_lr)}\n')
print(f'[LGBM Classifier]\n{classification_report(y_test, pred_lgbm)}\n')
#print(f'[CatBoost Classifier]\n{classification_report(y_test, pred_cat)}')
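
# GridSearchCV is imported in section 0 but never used; a minimal hedged
# sketch (grid values are assumptions) for tuning LogisticRegression's C
# on the TF-IDF features:
grid_lr = GridSearchCV(LogisticRegression(random_state=2024),
                       param_grid={'C': [0.1, 1.0, 10.0]},
                       cv=3, scoring='accuracy')
grid_lr.fit(X_tfidf, y)
print(grid_lr.best_params_, f'{grid_lr.best_score_*100:.2f}%')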

#========================================<5.1. Naive Bayes Classification>========================================
# DTM(=Document-Term Matrix)
dtmvector = CountVectorizer()
X_dtm = dtmvector.fit_transform(X)
print(X_dtm.shape)

# TF-IDF transform
tfidf_transformer = TfidfTransformer()
X_dtm_tfidf = tfidf_transformer.fit_transform(X_dtm)
print(X_dtm_tfidf.shape)

model_nb = MultinomialNB()
model_nb.fit(X_dtm_tfidf, y)

# Test score
X_test_dtm = dtmvector.transform(X_test) 
X_test_dtm_tfidf = tfidf_transformer.transform(X_test_dtm)

# Print score
pred_nb = model_nb.predict(X_test_dtm_tfidf)
acc_nb = accuracy_score(y_test, pred_nb)
print(f"[Naive Bayes] Accuracy >> {acc_nb*100:.2f}%") 

#========================================<5.2. Support Vector Machine>========================================
# Tokenizer : fit_on_texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

X_token_seq = tokenizer.texts_to_sequences(X)

# Maximum length = 116
max_len = max(len(line) for line in X_token_seq)

X_token_pad = pad_sequences(X_token_seq, maxlen=max_len)

# About 2 minutes
model_svm = SVC()
model_svm.fit(X_token_pad, y) 

# Test score
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

pred_svm = model_svm.predict(X_test_pad)
acc_svm = accuracy_score(y_test, pred_svm)

# Classification report
print("\n", classification_report(y_test, pred_svm))

# Print score
print(f"[Support Vector Machine] Accuracy >> {acc_svm*100:.2f}%") 

#========================================<6. Deep Learning>========================================

#========================================<6.1. Tokenizing & Padding>========================================

# Tokenizer (fit_on_texts)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

"""
print(tokenizer.word_index)
print(tokenizer.index_word)
print(tokenizer.word_counts)
"""

# Vocabulary size (number of distinct words)
max_words = len(tokenizer.index_word)
print(max_words)
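
# Hedged note: the default Tokenizer silently drops test-time words it never
# saw during fitting. A variant with an OOV token (an assumption, not used
# below) maps unseen words to a reserved index instead:
tokenizer_oov = Tokenizer(oov_token='<OOV>')
tokenizer_oov.fit_on_texts(X_train)
print(tokenizer_oov.texts_to_sequences(['this word is unseenzzz']))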

# Sentence to array
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)

print(len(X_train_seq), len(X_valid_seq))

print(X_train[0])
print(X_train_seq[0])

# Maximum length = 116
max_len = max(len(line) for line in X_train_seq)
print(max_len)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_len)

# Data check
X_train_pad[:1]

# print data shape
print(f"X_train_pad shape : {X_train_pad.shape}\nX_valid_pad : {X_valid_pad.shape}")

#========================================<6.2. LSTM Model>========================================
# Parameter setting
max_words = max_words + 1    # +1 for the padding index 0
embedding_dim = 32

tf.random.set_seed(2024)

# Make sequential model
model_DL = Sequential()

model_DL.add(Embedding(max_words, embedding_dim, input_length=max_len))

# LSTM Model
model_DL.add(LSTM(8, activation='leaky_relu', return_sequences=True))
#model_DL.add(Dropout(0.5))
#model_DL.add(LSTM(8, activation='leaky_relu', return_sequences=True))
model_DL.add(Flatten())
model_DL.add(Dense(8, activation='leaky_relu'))
model_DL.add(Dropout(0.3))
#model_DL.add(Dense(8, activation='leaky_relu'))
#model_DL.add(Dropout(0.3))
model_DL.add(Dense(class_num, activation='softmax')) 

# model compile 'sparse_categorical_crossentropy'
model_DL.compile(loss='sparse_categorical_crossentropy',
                 optimizer=tf.keras.optimizers.Nadam(0.001),
                 metrics=['accuracy'])

model_DL.summary()

# EarlyStopping
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# ModelCheckpoint
checkpoint_path = 'tmp_checkpoint.ckpt'
cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

# Model fit
with tf.device('/CPU:0'):
    history = model_DL.fit(X_train_pad, y_train, 
                        epochs=30, 
                        batch_size=256, 
                        validation_data=(X_valid_pad, y_valid), 
                        verbose =1, 
                        callbacks=[es, cp])
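
# Since save_best_only=True, the checkpoint holds the best validation epoch.
# A hedged sketch to restore it before evaluating (otherwise the model keeps
# the weights from the last epoch that ran):
model_DL = tf.keras.models.load_model(checkpoint_path)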

# Accuracy graph
epochs = range(1, len(history.history['accuracy']) + 1)
plt.plot(epochs, history.history['accuracy'])
plt.plot(epochs, history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid'])
plt.show()

# Loss graph
epochs = range(1, len(history.history['loss']) + 1)
plt.plot(epochs, history.history['loss'])
plt.plot(epochs, history.history['val_loss'])
plt.title('Learning Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid'])
plt.show()

#========================================<6.3. Test Data>========================================
df_test = pd.read_csv('./Data/Final/fashion_review/test.csv')

# Null replacement
print("Before Null Replacement")
print(df_test.isnull().sum())

# Replace null (do not delete)
df_test = df_test.fillna("")

print("\nAfter Null Replacement")
print(df_test.isnull().sum())

df_test['Review Text'] = df_test['Review Text'].str.replace('[^A-Za-z ]', '', regex=True)
df_test['Review Text'] = df_test['Review Text'].str.strip()

X_test = df_test['Review Text']
y_test = df_test['Rating']

# Reuse the tokenizer fitted on the train data; re-fitting on the test text
# would shift the word index the trained model expects.
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Label encoding (reuse the encoder fitted on the train labels; do not refit)
y_test = encoder.transform(y_test)

element_list_test = sorted(np.unique(y_test))

print("Before Encoding >>", encoder.classes_)
print(" After Encoding >>", element_list_test)
print()

# Evaluate
with tf.device('/CPU:0'):
    acc_DL = model_DL.evaluate(X_test_pad, y_test)[1]

print(f'\nTest Accuracy >> {acc_DL*100:.2f}%')
# [0]:Loss, [1]:Score

# predict
with tf.device('/CPU:0'):
    pred_DL = model_DL.predict(X_test_pad)

# Check predict result
print(np.argmax(pred_DL[3]))
print(y_test[3])

#Before Encoding >> ['Bad' 'Good' 'Neutral' 'VeryBad' 'VeryGood']
#After Encoding >> [0, 1, 2, 3, 4]
print(f'True >> {y_test[3]}')
print(f'Predict >> {np.argmax(pred_DL[3])}')
print(f"Original Document >> {X_test[3]}")

#========================================<7. Final Score>========================================
print(f'[DecisionTree Classifier] Accuracy >> {acc_dt*100:.2f}%\n')
print(f'[RandomForest Classifier] Accuracy >> {acc_rf*100:.2f}%\n')
print(f'[Logistic Regression] Accuracy >> {acc_lr*100:.2f}%\n')
print(f'[LGBM Classifier] Accuracy >> {acc_lgbm*100:.2f}%\n')

print(f"[Naive Bayes] Accuracy >> {acc_nb*100:.2f}%\n")

print(f'[Deep Learning] Accuracy >> {acc_DL*100:.2f}%')

# Select best_model
best_model = model_lr

# Machine Learning
import pickle
with open('best_model.pickle','wb') as fw:
    pickle.dump(best_model, fw)

with open('best_model.pickle','rb') as f:
    best_model = pickle.load(f)
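
# Hedged note: the fitted TfidfVectorizer should be persisted alongside the
# model, otherwise new text cannot be transformed identically at inference time.
with open('tfidf.pickle', 'wb') as fw:
    pickle.dump(tfidf, fw)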

# Deep Learning
"""
model.save('best_model.h5')
best_model = tf.keras.models.load_model('best_model.h5')
"""
#========================================<8. Submit>========================================
df_final = pd.read_csv('./Data/Final/fashion_review/test.csv')

#df_final = df_final.drop('Unnamed: 0', axis='columns')

# Check
pred_lr

# Inverse transform
pred_lr_inverse = encoder.inverse_transform(pred_lr)

# Check
pred_lr_inverse

df_final.insert(0, 'Predict', pred_lr_inverse)
df_final.to_csv('DF_Final.csv', index=False)

# Check
df_final

