NLP - Text Analysis with ML algorithms
- 1. Data Info
- 2. Preprocessing
- 3. Comparing classification models
- 4. Balanced sampling approach - imblearn
- 5. Result: Best Model
import sys
import pandas as pd
import os
import numpy as np
import re
import random
import itertools
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
random.seed(1)
train_data=pd.read_csv('midterm_train.csv')
test_data=pd.read_csv('midterm_test.csv')
# evaluating function
def evaluate(test_x, test_y, model):
    predictions = model.predict(test_x)
    print(classification_report(test_y, predictions))
1. Data Info
train_data.head(3)
|   | text | senti |
|---|------|-------|
| 0 | J brand is by far the best premium denim line ... | pos |
| 1 | I loved this dress. i kept putting it on tryin... | pos |
| 2 | I found this at my local store and ended up bu... | pos |
print(train_data.groupby('senti').count())
print(test_data.groupby('senti').count())
print(train_data.describe())
print(test_data.describe())
        text
senti
neg      2139
pos     11279
        text
senti
neg       231
pos      1260
text senti
count 13418 13418
unique 13414 2
top Perfect fit and i've gotten so many compliment... pos
freq 2 11279
text senti
count 1491 1491
unique 1491 2
top Have to disagree with previous posters. i foun... pos
freq 1 1260
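Both splits are heavily skewed toward positive reviews (roughly 84% pos / 16% neg), which is what motivates the balanced-sampling experiment in section 4. A quick check of the proportions (a small sketch using the same dataframes):
# class proportions in each split
print(train_data['senti'].value_counts(normalize=True))   # pos ~0.84, neg ~0.16
print(test_data['senti'].value_counts(normalize=True))    # pos ~0.85, neg ~0.15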
2. Preprocessing
2.1 Duplicated data found in train_data
# remove duplicated data
print(train_data.text.duplicated().sum())
train_data = train_data.drop_duplicates(['text'],keep='first')
train_data.duplicated().sum()
4
0
print(train_data.describe())
text senti
count 13414 13414
unique 13414 2
top J brand is by far the best premium denim line ... pos
freq 1 11276
# convert the deduplicated train/test texts and labels to arrays
x_train=np.array(train_data.text)
x_test=np.array(test_data.text)
y_train=np.array(train_data.senti)
y_test=np.array(test_data.senti)
x_train[0]
'J brand is by far the best premium denim line retailer sells! the fit on these jeans is amazing..worth every penny..also, considering it is a crop jean - warm weather wear - the denim weight is light and not too thick...the color is different from ordinary regular denim blue..lighter wash for spring/summer!'
# preprocessing: remove non-alphabet characters
x_train_clean=np.array([re.sub('[^a-zA-Z]',' ',text) for text in x_train])
x_test_clean=np.array([re.sub('[^a-zA-Z]',' ',text) for text in x_test])
x_train_clean[0]
'J brand is by far the best premium denim line retailer sells the fit on these jeans is amazing worth every penny also considering it is a crop jean warm weather wear the denim weight is light and not too thick the color is different from ordinary regular denim blue lighter wash for spring summer '
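Note that sklearn's vectorizers already lowercase text and tokenize with a default pattern that drops punctuation, so this regex step mainly removes digit tokens; that is likely why the raw and cleaned inputs score almost identically below. A quick check (a sketch; CountVectorizer is imported here only for the demonstration):
# the default analyzer already lowercases and drops punctuation
from sklearn.feature_extraction.text import CountVectorizer
analyzer = CountVectorizer().build_analyzer()
print(analyzer(x_train[0])[:8])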
3. Comparing classification models
# tuning parameter sets
ngram_range= [(1, 1), (1, 2),(2,2)]
stop_words=[None,'english']
clf__alpha=[0.005,0.01,0.05,0.1]
params = dict(ngram_range=ngram_range,
              stop_words=stop_words,
              clf__alpha=clf__alpha)
keys=params.keys()
values = (params[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
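Each element of combinations is one parameter setting; the grid above expands to 3 ngram ranges x 2 stop-word options x 4 alphas = 24 settings:
print(len(combinations))   # 24
print(combinations[0])     # {'ngram_range': (1, 1), 'stop_words': None, 'clf__alpha': 0.005}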
# tuning function
def tuning_model(params, vectorizer, classifier, x_train, x_test, y_train, y_test):
    ngram_range = params['ngram_range']
    stop_words = params['stop_words']
    # vectorize with the current parameter setting
    vec = vectorizer(ngram_range=ngram_range, stop_words=stop_words)
    vec_train = vec.fit_transform(x_train)
    vec_test = vec.transform(x_test)
    if classifier == MultinomialNB:
        alpha = params['clf__alpha']
        clf = classifier(alpha=alpha)   # alpha must be passed by keyword in recent sklearn
    else:
        clf = classifier(random_state=1, max_iter=500)
    clf.fit(vec_train, y_train)
    pred = clf.predict(vec_test)
    return f1_score(y_test, pred, average='macro'), params
# get best score & parameters
def get_result(combinations, vectorizer, classifier, x_train, x_test, y_train, y_test):
    results = []
    for params in combinations:
        results.append(tuning_model(params, vectorizer, classifier, x_train, x_test, y_train, y_test))
    return max(results, key=lambda item: item[0])
3.1 With TfidfVectorizer
3.1.1 MultinomialNB
Regex preprocessing seems to have no real effect: the raw and cleaned inputs reach the same best score with ngram_range (1, 2), and stop-word removal is not an important factor either.
Best f1-score (macro avg): 0.85
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
get_result(combinations,TfidfVectorizer,MultinomialNB,x_train,x_test,y_train,y_test)
(0.8528815948449455,
{'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.01})
get_result(combinations,TfidfVectorizer,MultinomialNB,x_train_clean,x_test_clean,y_train,y_test)
(0.8528815948449455,
{'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.005})
3.1.2 LogisticRegression
(clf__alpha still appears in the reported parameters below, but it is ignored for LogisticRegression; only MultinomialNB uses it.)
from sklearn.linear_model import LogisticRegression
get_result(combinations,TfidfVectorizer,LogisticRegression,x_train,x_test,y_train,y_test)
(0.8811180515581001,
{'ngram_range': (1, 1), 'stop_words': None, 'clf__alpha': 0.005})
get_result(combinations,TfidfVectorizer,LogisticRegression,x_train_clean,x_test_clean,y_train,y_test)
(0.8823175752378818,
{'ngram_range': (1, 1), 'stop_words': None, 'clf__alpha': 0.005})
3.2 With CountVectorizer
3.2.1 MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
get_result(combinations,CountVectorizer,MultinomialNB,x_train,x_test,y_train,y_test)
(0.9366865264206945,
{'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.1})
get_result(combinations,CountVectorizer,MultinomialNB,x_train_clean,x_test_clean,y_train,y_test)
(0.935509934324805,
{'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.1})
4. Balanced sampling approach - imblearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# tuning parameter sets for all cases
vectorizer=[CountVectorizer,TfidfVectorizer]
classifier=[MultinomialNB,LogisticRegression]
ngram_range= [(1, 1), (1, 2),(2,2)]
stop_words=[None,'english']
clf__alpha=[0.005,0.01,0.05,0.1]
params = dict(ngram_range=ngram_range,
              vectorizer=vectorizer,
              classifier=classifier,
              stop_words=stop_words,
              clf__alpha=clf__alpha)
keys=params.keys()
values = (params[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
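The expanded grid now crosses every option: 3 ngram ranges x 2 vectorizers x 2 classifiers x 2 stop-word options x 4 alphas = 96 combinations.
print(len(combinations))   # 96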
from imblearn.over_sampling import SMOTE

def smote_tuning(params, x_train, x_test, y_train, y_test):
    ngram_range = params['ngram_range']
    stop_words = params['stop_words']
    classifier = params['classifier']
    vec = params['vectorizer'](ngram_range=ngram_range, stop_words=stop_words)
    x_train_imb = vec.fit_transform(x_train)
    # binary labels: 1 for 'pos', 0 for 'neg' (taken from the deduplicated dataframes)
    y_train_imb = np.array(train_data['senti'] == 'pos').astype('int')
    y_test_imb = np.array(test_data['senti'] == 'pos').astype('int')
    # oversample the minority class in the vectorized feature space
    vec_train_over, y_train_over = SMOTE(random_state=1).fit_resample(x_train_imb, y_train_imb)
    vec_test = vec.transform(x_test)
    if classifier == MultinomialNB:
        alpha = params['clf__alpha']
        clf = classifier(alpha=alpha)   # alpha must be passed by keyword in recent sklearn
    else:
        clf = classifier(random_state=1, max_iter=500)
    clf.fit(vec_train_over, y_train_over)
    pred = clf.predict(vec_test)
    return f1_score(y_test_imb, pred, average='macro'), params
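For intuition: SMOTE synthesizes new minority-class points in the vectorized feature space until both classes match in size. A standalone sanity check (a sketch that fixes one vectorizer configuration; the exact counts assume the deduplicated training set):
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

vec_check = CountVectorizer()
X_check = vec_check.fit_transform(x_train)
y_check = np.array(train_data['senti'] == 'pos').astype('int')
X_over, y_over = SMOTE(random_state=1).fit_resample(X_check, y_check)
print(Counter(y_check))   # imbalanced: roughly 11276 pos vs 2138 neg
print(Counter(y_over))    # balanced after oversampling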
# get best score & parameters
def get_smote_result(combinations, x_train, x_test, y_train, y_test):
    results = []
    for params in combinations:
        results.append(smote_tuning(params, x_train, x_test, y_train, y_test))
    return max(results, key=lambda item: item[0])
get_smote_result(combinations,x_train,x_test,y_train,y_test)
(0.9285352359562871,
{'ngram_range': (1, 2),
'vectorizer': sklearn.feature_extraction.text.TfidfVectorizer,
'classifier': sklearn.naive_bayes.MultinomialNB,
'stop_words': None,
'clf__alpha': 0.1})
get_smote_result(combinations,x_train_clean,x_test_clean,y_train,y_test)
(0.9276401547886131,
{'ngram_range': (1, 2),
'vectorizer': sklearn.feature_extraction.text.TfidfVectorizer,
'classifier': sklearn.naive_bayes.MultinomialNB,
'stop_words': None,
'clf__alpha': 0.1})
5. Result: Best Model
The CountVectorizer + MultinomialNB setup from section 3.2 (without resampling) scored highest overall, so it is refit here as the final model.
# best parameters: ngram_range=(1, 2), stop_words=None, alpha=0.1
vectorizer=CountVectorizer(ngram_range=(1,2))
vectors_train=vectorizer.fit_transform(x_train)
vectors_test=vectorizer.transform(x_test)
clf=MultinomialNB(alpha=0.1)
clf.fit(vectors_train,y_train)
evaluate(vectors_test,y_test,clf)
              precision    recall  f1-score   support

         neg       0.90      0.88      0.89       231
         pos       0.98      0.98      0.98      1260

    accuracy                           0.97      1491
   macro avg       0.94      0.93      0.94      1491
weighted avg       0.97      0.97      0.97      1491
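To classify new reviews with the final model, transform the raw text with the same fitted vectorizer before predicting (a minimal usage sketch; the review string below is made up):
new_reviews = ["The fabric feels cheap and it ran two sizes small."]  # hypothetical example
print(clf.predict(vectorizer.transform(new_reviews)))                 # likely ['neg']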