LSTM model using text data

Deep learning example code with text data




# Inspect rows that contain anything other than lowercase a-z and spaces.
# na=False treats missing values as "no match" so the boolean mask never
# contains NaN (which would make the indexing below raise).
# NOTE(review): on a string column `.sum()` concatenates the matching rows
# into one long string; use `has_other_chars.sum()` if a row count is wanted.
has_other_chars = df['column'].str.contains(r'[^a-z ]', na=False)
df['column'][has_other_chars].sum()

# Remove every character that is not a lowercase letter or a space.
# regex=True must be explicit: since pandas 2.0 `str.replace` treats the
# pattern as a literal string by default, which would remove nothing here.
df['column'] = df['column'].str.replace(r'[^a-z ]', '', regex=True)

# Strip leading and trailing whitespace from every sentence
df['column'] = df['column'].str.strip()

# Count duplicated sentences
df['column'].duplicated().sum()

# Drop duplicated sentences (keeps the first occurrence)
df.drop_duplicates(subset=['column'], inplace=True)

# Bar chart of the value frequencies
df['column'].value_counts().plot(kind='bar')

# Split X and Y
# NOTE(review): 'column' is a placeholder name; as written both arrays are
# read from the same column. Point `features` at the text column and
# `labels` at the target column before running.
features = df['column'].values
labels = df['column'].values

# Hold out a test set. The vectorizing, training and evaluation steps further
# down read x_train / x_test / y_train / y_test, which were never created.
# NOTE(review): test_size and random_state are illustrative values - adjust.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42)

# TF-IDF
# The vocabulary and IDF weights are learned from the training text only;
# both splits are then vectorized with that same fitted vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(x_train)
x_train_v = tfidf.transform(x_train)
x_test_v = tfidf.transform(x_test)

# Tokenizing
# The tokenizer is fitted on the training text only.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

# number -> word mapping (the reverse, word -> number, is tokenizer.word_index)
tokenizer.index_word

# Total number of distinct words seen during fitting
len(tokenizer.index_word)

# Convert each sentence into a list of word indices
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Length of the longest training sentence. It is kept in a variable instead
# of being read off the notebook output and hard-coded (previously 38), so
# the padding below stays correct when the data changes.
max_len = max(len(line) for line in x_train_seq)

# Pad (or truncate) every sentence to the same length. Test sentences longer
# than the longest training sentence are truncated to max_len.
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len)
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len)

# Keras building blocks used below (none were imported before this point)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Flatten, Dense

# HyperParameter
# Both sizes are derived from the fitted tokenizer and the padded data rather
# than hard-coded (previously 47646 + 1 and 38), so they follow the dataset.
max_words = len(tokenizer.index_word) + 1    # +1 for the padding index
max_len = x_train_pad.shape[1]               # padded sequence length
embedding_dim = 32

# The model object has to exist before layers can be added to it
model = Sequential()

# Explicit input shape: Flatten/Dense below need a known sequence length.
# This replaces Embedding's deprecated `input_length` argument.
model.add(Input(shape=(max_len,)))
# model's first layer: embedding
model.add(Embedding(max_words, embedding_dim))
# LSTM layers; return_sequences=True keeps the output for every time step,
# which Flatten then collapses into a single vector per sample
model.add(LSTM(16, return_sequences=True))
model.add(LSTM(16, return_sequences=True))
model.add(Flatten())
model.add(Dense(128, activation='swish'))
model.add(Dense(32, activation='swish'))
# NOTE(review): 6 output units - presumably the dataset has 6 classes; with
# sparse_categorical_crossentropy the labels must be integers 0..5. Confirm.
model.add(Dense(6, activation='softmax'))

# Model Compile
# `metrics` is passed as a list, the form documented by Keras; a bare string
# is not accepted by every Keras version.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# Callback classes used below (they were never imported before this point)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# EarlyStopping: stop once val_loss has not improved for 10 consecutive epochs
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# Set ModelCheckpoint: save only when val_loss improves on the best so far
# NOTE(review): newer Keras releases require the checkpoint path to end in
# `.keras` (or `.weights.h5` with save_weights_only=True) - confirm against
# the installed version before changing the path.
checkpoint_path = 'tmp_checkpoint.ckpt'
cp = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True)

# model fit: the last 20% of the training data is held out for validation,
# which is where the monitored val_loss comes from
history = model.fit(x_train_pad, y_train, epochs=50, batch_size=512,
                    validation_split=0.2, verbose=1, callbacks=[es, cp])

# model evaluate using test dataset
model.evaluate(x_test_pad, y_test)

# Predict using model: output for the first test sample only
predict = model.predict(x_test_pad[:1])
    

Comments

Popular posts from this blog

[Kaggle] Titanic Survivor Classification

Machine Learning ShootOut

[Kaggle] Pizza or Not Classification (Computer Vision)