/ PROJECTS

NLP - Text Analysis with ML algorithms

import sys
import pandas as pd
import os
import numpy as np
import re
import random
import itertools
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
random.seed(1)  # seeds Python's random module only; sklearn randomness is controlled via random_state below
train_data=pd.read_csv('midterm_train.csv')
test_data=pd.read_csv('midterm_test.csv')
# evaluation function: print precision/recall/F1 for a fitted model on held-out data
def evaluate(test_x,test_y,model):
    predictions=model.predict(test_x)
    print(classification_report(test_y,predictions))

1. Data Info

train_data.head(3)
                                                text senti
0  J brand is by far the best premium denim line ...   pos
1  I loved this dress. i kept putting it on tryin...   pos
2  I found this at my local store and ended up bu...   pos
print(train_data.groupby('senti').count())
print(test_data.groupby('senti').count())
print(train_data.describe())
print(test_data.describe())
        text
senti
neg     2139
pos    11279

       text
senti
neg     231
pos    1260

                                                     text  senti
count                                               13418  13418
unique                                              13414      2
top     Perfect fit and i've gotten so many compliment...    pos
freq                                                    2  11279

                                                     text senti
count                                                1491  1491
unique                                               1491     2
top     Have to disagree with previous posters. i foun...   pos
freq                                                    1  1260
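
Both splits are heavily imbalanced (about 84% pos vs. 16% neg), which is why the macro-averaged F1 score is used for model selection below and why section 4 tries balanced resampling. The train split also contains duplicates (13418 rows but only 13414 unique texts), handled in 2.1.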

2. Preprocessing

2.1 Duplicated data found in train_data

# count duplicated reviews, then drop them (keeping the first occurrence)
print(train_data.text.duplicated().sum())
train_data = train_data.drop_duplicates(['text'],keep='first')
train_data.duplicated().sum()
4

0
print(train_data.describe())
                                                     text  senti
count                                               13414  13414
unique                                              13414      2
top     J brand is by far the best premium denim line ...    pos
freq                                                    1  11276
# extract text and label arrays from the deduplicated train data and the test data
x_train=np.array(train_data.text)
x_test=np.array(test_data.text)
y_train=np.array(train_data.senti)
y_test=np.array(test_data.senti)
x_train[0]
'J brand is by far the best premium denim line retailer sells! the fit on these jeans is amazing..worth every penny..also, considering it is a crop jean - warm weather wear - the denim weight is light and not too thick...the color is different from ordinary regular denim blue..lighter wash for spring/summer!'
# preprocessing: remove non-alphabet characters
x_train_clean=np.array([re.sub('[^a-zA-Z]',' ',text) for text in x_train])
x_test_clean=np.array([re.sub('[^a-zA-Z]',' ',text) for text in x_test])
x_train_clean[0]
'J brand is by far the best premium denim line retailer sells  the fit on these jeans is amazing  worth every penny  also  considering it is a crop jean   warm weather wear   the denim weight is light and not too thick   the color is different from ordinary regular denim blue  lighter wash for spring summer '
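
The substitution leaves runs of spaces behind, but that is harmless: scikit-learn's default analyzer tokenizes on word boundaries and skips extra whitespace. A quick illustrative check (not in the original run):

from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer().build_analyzer()('crop jean   warm weather wear')
['crop', 'jean', 'warm', 'weather', 'wear']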

3. Comparing classification models

# tuning parameter sets
ngram_range= [(1, 1), (1, 2),(2,2)]
stop_words=[None,'english']
clf__alpha=[0.005,0.01,0.05,0.1]

params = dict(ngram_range=ngram_range,
              stop_words=stop_words,
              clf__alpha=clf__alpha)
keys=params.keys()
values = (params[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
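Each element of combinations is one hyperparameter setting; with 3 n-gram ranges, 2 stop-word options, and 4 alpha values the grid has 3 x 2 x 4 = 24 entries. An illustrative check (not in the original run):

len(combinations)
24
combinations[0]
{'ngram_range': (1, 1), 'stop_words': None, 'clf__alpha': 0.005}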
# tuning function: fit one vectorizer/classifier configuration and return its macro F1 on the test set
def tuning_model(params,vectorizer,classifier,x_train,x_test,y_train,y_test):
    ngram_range=params['ngram_range']
    stop_words=params['stop_words']
    vec = vectorizer(ngram_range=ngram_range,stop_words=stop_words)
    vec_train = vec.fit_transform(x_train)
    vec_test = vec.transform(x_test)
    if classifier is MultinomialNB:
        # pass alpha by keyword; recent scikit-learn versions reject positional estimator args
        clf=classifier(alpha=params['clf__alpha'])
    else:
        clf=classifier(random_state=1,max_iter=500)
    clf.fit(vec_train, y_train)
    pred=clf.predict(vec_test)
    return f1_score(y_test,pred,average='macro'),params
# get best score & parameters
def get_result(combinations,vectorizer,classifier,x_train,x_test,y_train,y_test):
    results=[]
    for params in combinations:
        results.append(tuning_model(params,vectorizer,classifier,x_train,x_test,y_train,y_test))
    return max(results,key=lambda item: item[0])
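
These two helpers amount to a manual grid search scored directly on the held-out test set. For comparison, roughly the same grid could be expressed with scikit-learn's Pipeline and GridSearchCV, which instead cross-validates on the training data (a minimal sketch, not part of the original experiment):

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([('vec', TfidfVectorizer()), ('clf', MultinomialNB())])
grid = {'vec__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'vec__stop_words': [None, 'english'],
        'clf__alpha': [0.005, 0.01, 0.05, 0.1]}
search = GridSearchCV(pipe, grid, scoring='f1_macro', cv=5)
search.fit(x_train, y_train)
print(search.best_score_, search.best_params_)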

3.1 With TfidfVectorizer

3.1.1 MultinomialNB

With ngram_range (1, 2), the regex cleaning makes no difference (identical scores below), and removing stop words does not help: the best settings keep stop_words=None.
Best macro-averaged F1: 0.85

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
get_result(combinations,TfidfVectorizer,MultinomialNB,x_train,x_test,y_train,y_test)
(0.8528815948449455,
 {'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.01})
get_result(combinations,TfidfVectorizer,MultinomialNB,x_train_clean,x_test_clean,y_train,y_test)
(0.8528815948449455,
 {'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.005})

3.1.2 LogisticRegression

from sklearn.linear_model import LogisticRegression
get_result(combinations,TfidfVectorizer,LogisticRegression,x_train,x_test,y_train,y_test)
(0.8811180515581001,
 {'ngram_range': (1, 1), 'stop_words': None, 'clf__alpha': 0.005})
get_result(combinations,TfidfVectorizer,LogisticRegression,x_train_clean,x_test_clean,y_train,y_test)
(0.8823175752378818,
 {'ngram_range': (1, 1), 'stop_words': None, 'clf__alpha': 0.005})

3.2 With CountVectorizer

3.2.1 MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
get_result(combinations,CountVectorizer,MultinomialNB,x_train,x_test,y_train,y_test)
(0.9366865264206945,
 {'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.1})
get_result(combinations,CountVectorizer,MultinomialNB,x_train_clean,x_test_clean,y_train,y_test)
(0.935509934324805,
 {'ngram_range': (1, 2), 'stop_words': None, 'clf__alpha': 0.1})

4. Balanced sampling approach - imblearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# tuning parameter sets for all cases
vectorizer=[CountVectorizer,TfidfVectorizer]
classifier=[MultinomialNB,LogisticRegression]
ngram_range= [(1, 1), (1, 2),(2,2)]
stop_words=[None,'english']
clf__alpha=[0.005,0.01,0.05,0.1]

params = dict(ngram_range=ngram_range,
              vectorizer=vectorizer,
              classifier=classifier,
              stop_words=stop_words,
              clf__alpha=clf__alpha)
keys=params.keys()
values = (params[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
from imblearn.over_sampling import SMOTE
def smote_tuning(params,x_train,x_test,y_train,y_test):
    ngram_range=params['ngram_range']
    stop_words=params['stop_words']
    classifier=params['classifier']
    vec=params['vectorizer'](ngram_range=ngram_range,stop_words=stop_words)
    x_train_imb=vec.fit_transform(x_train)
    # build 0/1 labels from the arrays passed in, not the global dataframes
    y_train_imb=(y_train=='pos').astype(int)
    y_test_imb=(y_test=='pos').astype(int)
    vec_train_over, y_train_over = SMOTE(random_state=1).fit_resample(x_train_imb,y_train_imb)
    vec_test = vec.transform(x_test)
    if classifier is MultinomialNB:
        # pass alpha by keyword, as in tuning_model above
        clf=classifier(alpha=params['clf__alpha'])
    else:
        clf=classifier(random_state=1,max_iter=500)
    clf.fit(vec_train_over, y_train_over)
    pred=clf.predict(vec_test)
    return f1_score(y_test_imb,pred,average='macro'),params
# get best score & parameters
def get_smote_result(combinations,x_train,x_test,y_train,y_test):
    results=[]
    for params in combinations:
        results.append(smote_tuning(params,x_train,x_test,y_train,y_test))
    return max(results,key=lambda item: item[0])
get_smote_result(combinations,x_train,x_test,y_train,y_test)
(0.9285352359562871,
 {'ngram_range': (1, 2),
  'vectorizer': sklearn.feature_extraction.text.TfidfVectorizer,
  'classifier': sklearn.naive_bayes.MultinomialNB,
  'stop_words': None,
  'clf__alpha': 0.1})
get_smote_result(combinations,x_train_clean,x_test_clean,y_train,y_test)
(0.9276401547886131,
 {'ngram_range': (1, 2),
  'vectorizer': sklearn.feature_extraction.text.TfidfVectorizer,
  'classifier': sklearn.naive_bayes.MultinomialNB,
  'stop_words': None,
  'clf__alpha': 0.1})
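
The same experiment can be written more compactly with imblearn's own Pipeline, which applies SMOTE only during fit and skips it at predict time (a sketch using the best configuration found above; not part of the original run):

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

imb_pipe = ImbPipeline([('vec', TfidfVectorizer(ngram_range=(1, 2))),
                        ('smote', SMOTE(random_state=1)),
                        ('clf', MultinomialNB(alpha=0.1))])
imb_pipe.fit(x_train, (y_train == 'pos').astype(int))
pred = imb_pipe.predict(x_test)
print(f1_score((y_test == 'pos').astype(int), pred, average='macro'))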

5. Result: Best Model

CountVectorizer with MultinomialNB and no resampling gave the highest macro F1 overall (0.937 in 3.2.1), so that configuration is refit here as the final model.

# best parameters: ngram_range=(1, 2), stop_words=None, alpha=0.1
vectorizer=CountVectorizer(ngram_range=(1,2))
vectors_train=vectorizer.fit_transform(x_train)
vectors_test=vectorizer.transform(x_test)
clf=MultinomialNB(alpha=0.1)
clf.fit(vectors_train,y_train)
evaluate(vectors_test,y_test,clf)
              precision    recall  f1-score   support

         neg       0.90      0.88      0.89       231
         pos       0.98      0.98      0.98      1260

    accuracy                           0.97      1491
   macro avg       0.94      0.93      0.94      1491
weighted avg       0.97      0.97      0.97      1491
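
As a sanity check (not in the original run), the n-grams the final model weights most heavily per class can be inspected from its class-conditional log probabilities; a minimal sketch, assuming scikit-learn >= 1.0 (older versions use get_feature_names() instead):

import numpy as np
feature_names = np.asarray(vectorizer.get_feature_names_out())
for i, label in enumerate(clf.classes_):
    # indices of the 10 n-grams with the highest log probability for this class
    top = np.argsort(clf.feature_log_prob_[i])[-10:][::-1]
    print(label, list(feature_names[top]))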