[Kaggle] Fashion Shopping Review Classification (NLP)
Load the review text data and build text-classification models (classic ML and an LSTM).
"""
<< Index >>
- 0. Modules
- 1. Train Data Load
- 2. Data Preprocessing
- 2.1. Null Data
- 2.2. Language, Word filtering and check duplicated data
- 2.3. Label Check
- 2.4. Label Encoding
- 3. Data Split
- 3.1. Features & Labels
- 3.2. Train & Valid
- 4. Test Data Load
- 5. Machine Learning
- 5.0. Total Comparison
- 5.1. Naive Bayes Classification
- 5.2. Support Vector Machine
- 6. Deep Learning
- 6.1. Tokenizing & Padding
- 6.2. LSTM Model
- 6.3. Test Data
- 7. Final Score
- 8. Submit
"""
#========================================<0. Modules>========================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score, r2_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.svm import SVC
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPool1D, Dropout
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
#========================================<1. Train Data Load>========================================
df_train = pd.read_csv('./Data/Final/fashion_review/train.csv')
df_train.head()
df_train.info()
#========================================<2. Data Preprocessing>========================================
#========================================<2.1. Null Data>========================================
df_train.isnull().sum()
# Print null ratio
print(f"Null Ratio >> {df_train['Review Text'].isnull().sum() / len(df_train)*100:.2f}%")
# Drop null
df_train = df_train.dropna()
df_train.reset_index(drop=True, inplace=True)
# Check again
df_train.isnull().sum()
df_train.info()
#========================================<2.2. Language, Word filtering and check duplicated data>========================================
# Print sentences that contain non-alphabetic characters
df_train[df_train['Review Text'].str.contains('[^A-Za-z]')].values[:10]
# Remove non-alphabetic characters (keep spaces so words are not merged together)
df_train['Review Text'] = df_train['Review Text'].str.replace('[^A-Za-z ]', ' ', regex=True)
# Delete space before and after
df_train['Review Text'] = df_train['Review Text'].str.strip()
# Count Duplicated Data
df_train['Review Text'].duplicated().sum()
# Duplicated data ratio
print(f"{df_train['Review Text'].duplicated().sum() / len(df_train['Review Text']) * 100:.2f}%")
# Drop duplicates
df_train = df_train.drop_duplicates(['Review Text'], keep='first')
# Check again
df_train['Review Text'].duplicated().sum()
#========================================<2.3. Label Check>========================================
df_train['Rating'].value_counts()
# Plot bar graph
df_train['Rating'].value_counts().plot(kind='bar')
plt.show()
# Plot pie chart
df_train['Rating'].value_counts().plot(kind='pie')
plt.show()
#========================================<2.4. Label Encoding>========================================
encoder = LabelEncoder()
df_train['Rating'] = encoder.fit_transform(df_train['Rating'])
element_list = sorted(df_train['Rating'].unique())
print("Before Encoding >>", encoder.classes_)
print(" After Encoding >>", element_list)
# Class num
class_num = len(element_list)
print(class_num)
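# For reference, the explicit label-to-integer mapping can be recovered from the
# fitted encoder. A minimal sketch (assumes `encoder` fitted above; the expected
# output follows from the classes printed earlier):
label_map = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_map)  # {'Bad': 0, 'Good': 1, 'Neutral': 2, 'VeryBad': 3, 'VeryGood': 4}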
#========================================<3. Data Split>========================================
#========================================<3.1. Features & Labels>========================================
X = df_train['Review Text'].values
y = df_train['Rating'].values
print(f"Feature data shape : {X.shape}\nLabel data shape : {y.shape}")
print(f'Maximum Sentence Length : {max(len(l) for l in X)}')
print(f'Average Sentence Length : {sum(map(len, X))/len(X):.2f}')
# Data distribution graph
plt.hist([len(s) for s in X], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
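# The histogram suggests picking a cut-off length; a small helper (not in the
# original notebook) to check how much of the data a candidate cut-off covers.
# The threshold of 500 characters below is an arbitrary assumption:
def coverage_below(threshold, samples):
    """Fraction of samples whose character length is <= threshold."""
    return sum(len(s) <= threshold for s in samples) / len(samples)

print(f"Coverage at length 500 : {coverage_below(500, X)*100:.2f}%")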
#========================================<3.2. Train & Valid>========================================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=24)
print(f'Train Dataset >> X_train {X_train.shape}, y_train {y_train.shape}')
print(f'Valid Dataset >> X_valid {X_valid.shape}, y_valid {y_valid.shape}')
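# Sanity check (not in the original): stratify=y should keep the label
# proportions nearly identical between the train and valid splits.
print(pd.Series(y_train).value_counts(normalize=True).sort_index())
print(pd.Series(y_valid).value_counts(normalize=True).sort_index())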
# Before Encoding >> ['Bad' 'Good' 'Neutral' 'VeryBad' 'VeryGood']
# After Encoding >> [0, 1, 2, 3, 4]
for i in range(3):
    print(X_train[i])
    print(y_train[i])
    print()
#========================================<4. Test Data Load>========================================
df_test = pd.read_csv('./Data/Final/fashion_review/test.csv')
len(df_test)
print("Before Null Replacement")
print(df_test.isnull().sum())
# Replace null (do not delete)
df_test = df_test.fillna("")
print("\nAfter Null Replacement")
print(df_test.isnull().sum())
# Label encoding (reuse the encoder fitted on the train labels; do not refit)
df_test['Rating'] = encoder.transform(df_test['Rating'])
element_list_test = sorted(df_test['Rating'].unique())
print("Before Encoding >>", encoder.classes_)
print(" After Encoding >>", element_list_test)
print()
X_test = df_test['Review Text'].values
y_test = df_test['Rating'].values
print(f"Feature data shape : {X_test.shape}\nLabel data shape : {y_test.shape}")
#========================================<5. Machine Learning>========================================
#========================================<5.0. Total Comparison>========================================
# ※ TF-IDF Vectorizing
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)
#print(X_tfidf)
# TF-IDF matrix shape: (documents, terms)
X_tfidf.shape
X_test_v = tfidf.transform(X_test)
model_dt = DecisionTreeClassifier(random_state=2024)
model_rf = RandomForestClassifier(random_state=2024)
model_lr = LogisticRegression(random_state=2024)
model_lgbm = LGBMClassifier(random_state=2024)
model_cat = CatBoostClassifier(random_state=2024)
# DecisionTree Classifier
model_dt.fit(X_tfidf, y)
pred_dt = model_dt.predict(X_test_v)
acc_dt = accuracy_score(y_test, pred_dt)
# RandomForest Classifier
model_rf.fit(X_tfidf, y)
pred_rf = model_rf.predict(X_test_v)
acc_rf = accuracy_score(y_test, pred_rf)
# Logistic Regression
model_lr.fit(X_tfidf, y)
pred_lr = model_lr.predict(X_test_v)
acc_lr = accuracy_score(y_test, pred_lr)
# LGBM Classifier
model_lgbm.fit(X_tfidf, y)
pred_lgbm = model_lgbm.predict(X_test_v)
acc_lgbm = accuracy_score(y_test, pred_lgbm)
# about 7 minutes
"""
# CatBoost Classifier
model_cat.fit(X_tfidf, y)
pred_cat = model_cat.predict(X_test_v)
acc_cat = accuracy_score(y_test, pred_cat)
"""
print(f'[DecisionTree Classifier]\n{classification_report(y_test, pred_dt)}\n')
print(f'[RandomForest Classifier]\n{classification_report(y_test, pred_rf)}\n')
print(f'[Logistic Regression]\n{classification_report(y_test, pred_lr)}\n')
print(f'[LGBM Classifier]\n{classification_report(y_test, pred_lgbm)}\n')
#print(f'[CatBoost Classifier]\n{classification_report(y_test, pred_cat)}')
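# Side-by-side accuracy summary of the fitted models; a minimal sketch
# (CatBoost is omitted because its training block above is commented out):
acc_summary = pd.Series({'DecisionTree': acc_dt,
                         'RandomForest': acc_rf,
                         'LogisticRegression': acc_lr,
                         'LGBM': acc_lgbm})
print(acc_summary.sort_values(ascending=False))
acc_summary.plot(kind='bar')
plt.ylabel('accuracy')
plt.show()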
#========================================<5.1. Naive Bayes Classification>========================================
# DTM(=Document-Term Matrix)
dtmvector = CountVectorizer()
X_dtm = dtmvector.fit_transform(X)
print(X_dtm.shape)
# TF-IDF transform
tfidf_transformer = TfidfTransformer()
X_dtm_tfidf = tfidf_transformer.fit_transform(X_dtm)
print(X_dtm_tfidf.shape)
model_nb = MultinomialNB()
model_nb.fit(X_dtm_tfidf, y)
# Test score
X_test_dtm = dtmvector.transform(X_test)
X_test_dtm_tfidf = tfidf_transformer.transform(X_test_dtm)
# y_test is already label-encoded above, so no further transform is needed
# Print score
pred_nb = model_nb.predict(X_test_dtm_tfidf)
acc_nb = accuracy_score(y_test, pred_nb)
print(f"[Naive Bayes] Accuracy >> {acc_nb*100:.2f}%")
#========================================<5.2. Support Vector Machine>========================================
# Tokenizer : fit_on_texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_token_seq = tokenizer.texts_to_sequences(X)
# Maximum sequence length (116 for this dataset)
max_len = max(len(line) for line in X_token_seq)
X_token_pad = pad_sequences(X_token_seq, maxlen=max_len)
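# A small addition (not in the original): record the vocabulary size, which the
# Embedding layer in the deep-learning section will need. The +1 accounts for
# Keras reserving index 0 for padding:
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size : {vocab_size}")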
# About 2 minutes
model_svm = SVC()
model_svm.fit(X_token_pad, y)
# Test score
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
pred_svm = model_svm.predict(X_test_pad)
acc_svm = accuracy_score(y_test, pred_svm)
# Confusion matrix
print(confusion_matrix(y_test, pred_svm))
print(f"[SVM] Accuracy >> {acc_svm*100:.2f}%")