LSTM model using text data
Deep learning example code: preprocessing text data and training an LSTM classifier with Keras
# Count rows containing characters other than lowercase letters and spaces
df['column'].str.contains('[^a-z ]').sum()
# Remove every character that is not a lowercase letter or a space
df['column'] = df['column'].str.replace('[^a-z ]', '', regex=True)
# Strip leading and trailing whitespace from each sentence
df['column'] = df['column'].str.strip()
# Count duplicate rows
df['column'].duplicated().sum()
# Drop duplicate rows
df.drop_duplicates(subset=['column'], inplace=True)
# Plot the class distribution with value_counts()
import matplotlib.pyplot as plt
df['column'].value_counts().plot(kind='bar')
plt.show()
# Split features (X) and labels (y); 'label' is a placeholder for the target column
features = df['column'].values
labels = df['label'].values
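# The cells below assume x_train/x_test/y_train/y_test already exist;
# one way to create them (a sketch using scikit-learn's train_test_split):
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels)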
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x_train_v = tfidf.fit_transform(x_train)
x_test_v = tfidf.transform(x_test)
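# The TF-IDF vectors are not fed to the LSTM below; they suit classical models.
# A quick baseline for comparison (LogisticRegression is an assumption, not
# part of the original walkthrough):
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train_v, y_train)
clf.score(x_test_v, y_test)  # baseline accuracy to compare against the LSTM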
# Tokenizing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
# index -> word mapping (use tokenizer.word_index for word -> index)
tokenizer.index_word
# Vocabulary size (number of distinct words)
len(tokenizer.index_word)
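# Keeping the count in a variable avoids a magic number later: the Embedding
# layer needs vocabulary size + 1, and the 47646 in the hyperparameter cell
# below comes from this call
vocab_size = len(tokenizer.index_word)  # 47646 in this run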
# Convert each sentence to a sequence of word indices
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)
# Maximum sentence length (in tokens)
max(len(line) for line in x_train_seq)
# Pad every sequence to the maximum length (38 here, from the cell above)
x_train_pad = pad_sequences(x_train_seq, maxlen=38)
x_test_pad = pad_sequences(x_test_seq, maxlen=38)
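# Note: pad_sequences pads and truncates at the front by default; a tiny demo:
pad_sequences([[1, 2], [1, 2, 3, 4]], maxlen=3)
# -> [[0 1 2]
#     [2 3 4]]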
# Hyperparameters
max_words = 47646 + 1  # vocabulary size + 1 for the padding index (0)
max_len = 38
embedding_dim = 32
# Build the model: the first layer is an Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
# Stacked LSTM layers (after the Embedding layer)
model.add(LSTM(16, return_sequences=True))
model.add(LSTM(16, return_sequences=True))
model.add(Flatten())
model.add(Dense(128, activation='swish'))
model.add(Dense(32, activation='swish'))
model.add(Dense(6, activation='softmax'))
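Note on the head of the network: because the second LSTM also sets return_sequences=True, it outputs all 38 timesteps, and Flatten unrolls them into a 38 × 16 = 608-dimensional vector for the Dense layers. The more common alternative is return_sequences=False on the final LSTM, which returns only the last hidden state and makes Flatten unnecessary; the version above simply hands the Dense layers the full sequence.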
# Compile the model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
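# sparse_categorical_crossentropy expects integer class labels (0-5 for the
# 6-way softmax above). If the label column holds strings, encode it first;
# a sketch using scikit-learn's LabelEncoder (an assumption, not shown above):
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(labels)  # fit on all labels so train/test share the encoding
y_train = le.transform(y_train)
y_test = le.transform(y_test)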
# EarlyStopping: stop training when val_loss stops improving
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
# ModelCheckpoint: keep only the best (lowest val_loss) weights
# (save_weights_only=True pairs with the .ckpt path and with load_weights below)
checkpoint_path = 'tmp_checkpoint.ckpt'
cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1,
                     save_best_only=True, save_weights_only=True)
# Fit the model
history = model.fit(x_train_pad, y_train, epochs=50, batch_size=512,
                    validation_split=0.2, verbose=1, callbacks=[es, cp])
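# Training usually runs past the best epoch (EarlyStopping waits out its
# patience window), so restore the checkpointed best weights before evaluating
model.load_weights(checkpoint_path)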
# Evaluate on the test set
model.evaluate(x_test_pad, y_test)
# Predict on the first test sample
predict = model.predict(x_test_pad[:1])
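# model.predict returns a row of softmax probabilities, one per class; argmax
# gives the predicted class index (and, if labels were encoded with the
# LabelEncoder sketch above, inverse_transform recovers the original label)
import numpy as np
pred_class = np.argmax(predict, axis=1)
le.inverse_transform(pred_class)  # only if LabelEncoder was used for y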