We will use the SMS spam collection dataset from: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/
The dataset contains 5,572 SMS messages, each labelled as spam or non-spam (ham).
We are going to try two different classifiers that take different approaches: a Bernoulli Naive Bayes model and a Multinomial Naive Bayes model.
# import libraries
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
# Load the tab-separated SMS Spam Collection file. It has no header row, so
# columns are addressed by position (0 = label, 1 = message text).
# NOTE(review): pandas' default quote handling may merge a few lines of this
# file (the UCI description reportedly lists 5,574 messages) - confirm whether
# quoting=csv.QUOTE_NONE is wanted before relying on the row count.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
# Preview the first rows (rendered as cell output in a notebook).
df.head()
import nltk
from string import punctuation
from nltk.corpus import stopwords
# Fetch the NLTK resources this script relies on.
nltk.download('stopwords')
# nltk.word_tokenize (used by to_lower and the preprocessing step) needs the
# Punkt tokenizer models; without them a fresh environment raises LookupError.
# Recent NLTK releases look for 'punkt_tab', older ones for 'punkt', so both
# are requested (an id unknown to the installed version is reported by the
# downloader and skipped).
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop words, kept as a list so existing uses of `stopword` still work.
stopword = stopwords.words("english")
def to_lower(text):
    """Return ``text`` tokenized with NLTK and re-joined in lower case.

    The text is split with ``nltk.word_tokenize``, every token is lower-cased,
    and the tokens are joined back together with single spaces, so "Hello"
    becomes "hello" and "HELLO" becomes "hello".
    """
    tokens = nltk.word_tokenize(text)
    return ' '.join(map(str.lower, tokens))
# Set lookup keeps the per-token stop-word test O(1); `stopword` is a list.
_stopword_set = set(stopword)


def _clean_sms(text):
    """Return the cleaned token list for a single SMS message.

    Steps, in order: lower-case via ``to_lower``, drop every digit character,
    drop every character found in ``string.punctuation``, tokenize the result
    with ``nltk.word_tokenize`` and discard English stop words.
    """
    lowered = to_lower(text)
    without_digits = ''.join(ch for ch in lowered if not ch.isdigit())
    without_punct = ''.join(ch for ch in without_digits if ch not in punctuation)
    tokens = nltk.word_tokenize(without_punct)
    return [token for token in tokens if token not in _stopword_set]


# Column 1 holds the raw message text; iterate over it directly instead of
# building a row Series per message with df.iloc[i][1].
processed_sms = [_clean_sms(text) for text in df[1]]
df['processed sms'] = processed_sms
df.head()
# Shuffle the rows so the train/test split is random.
# NOTE: no seed is set in the visible code, so the split (and every score
# computed from it) changes from run to run.
df_shuffled = df.iloc[np.random.permutation(len(df))]
df_shuffled = df_shuffled.reset_index(drop=True)
df_shuffled.shape
# Split 2/3 into training data and 1/3 into test data. The boundary is derived
# from the row count (rounded up) rather than hard-coded; for 5572 rows it is
# still 3715.
split_at = int(np.ceil(len(df_shuffled) * 2 / 3))
training_data = df_shuffled.iloc[:split_at].copy()
test_data = df_shuffled.iloc[split_at:].copy()
# Build binary input variables: one column per vocabulary word, one row per
# training message, 1.0 where the word occurs in the message.
# The vocabulary is collected with a set comprehension (summing the token lists
# with Series.sum() concatenates them quadratically and fails on empty input).
# It stays a list because it is reused as the DataFrame column labels.
vocabulary = list({word for tokens in training_data['processed sms'] for word in tokens})
# Row count follows the data instead of a hard-coded 3715.
word_check = pd.DataFrame(np.zeros((len(training_data), len(vocabulary))), columns=vocabulary)
word_check.head()
# Map each word to its column position for O(1) lookup.
_train_column_of = {word: position for position, word in enumerate(vocabulary)}
for row, tokens in enumerate(training_data['processed sms']):
    for word in tokens:
        # Positional scalar assignment; the previous chained form
        # word_check[word][i] = 1 may write to a temporary copy and is a
        # silent no-op under pandas Copy-on-Write.
        word_check.iat[row, _train_column_of[word]] = 1
word_check.head()
training_x = word_check.copy()
# Binary target for the training rows: 1 for 'spam', 0 for anything else (ham).
# Column 0 of the data holds the label.
training_y = [1 if label == 'spam' else 0 for label in training_data.iloc[:, 0]]
# Train a Bernoulli naive Bayes classifier on the binary word indicators.
from sklearn.naive_bayes import BernoulliNB

# fit() returns the fitted estimator, so construction and training are chained.
clf = BernoulliNB().fit(training_x, training_y)
# Renumber the test rows from 0 so they line up with the indicator matrix.
test_data = test_data.reset_index(drop=True)
# Build binary input variables for the test data using the same vocabulary
# (and therefore the same columns) as the training data.
# Row count follows the data instead of a hard-coded 1857.
test_word_check = pd.DataFrame(np.zeros((len(test_data), len(vocabulary))), columns=vocabulary)
# Map each word to its column position for O(1) lookup.
_test_column_of = {word: position for position, word in enumerate(vocabulary)}
for row, tokens in enumerate(test_data['processed sms']):
    for word in tokens:
        column = _test_column_of.get(word)
        # Words never seen during training have no column and are ignored.
        if column is not None:
            # Positional scalar assignment; the previous chained form
            # test_word_check[word][i] = 1 may write to a temporary copy and
            # is a silent no-op under pandas Copy-on-Write.
            test_word_check.iat[row, column] = 1
test_word_check
test_x = test_word_check.copy()
# Binary target for the test rows: 1 for 'spam', 0 for anything else (ham).
# Column 0 of the data holds the label.
test_y = [1 if label == 'spam' else 0 for label in test_data.iloc[:, 0]]
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predicted labels for the test set (rendered as cell output in a notebook).
clf.predict(test_x)
# Score of the Bernoulli model on the test data.
clf.score(test_x, test_y)

# Accuracy and classification report for the Bernoulli Naive Bayes classifier.
test_predict_bin = clf.predict(test_x)
bernoulli_accuracy = accuracy_score(y_true=test_y, y_pred=test_predict_bin)
print('Accuracy Score: ', bernoulli_accuracy)
print(classification_report(y_true=test_y, y_pred=test_predict_bin))

# Confusion matrix for the Bernoulli model, drawn as an annotated heatmap.
cm = confusion_matrix(test_y, test_predict_bin)
bernoulli_class_names = ['Ham', 'Spam']
sns.set(font_scale=1.5)
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt='g', cbar=False)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.xaxis.set_ticklabels(bernoulli_class_names)
ax.yaxis.set_ticklabels(bernoulli_class_names)
plt.show()
# Train a Multinomial Naive Bayes classifier on the same training inputs.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training are chained.
mnb = MultinomialNB().fit(training_x, training_y)
# Score of the Multinomial model on the test data.
mnb.score(test_x, test_y)

# Accuracy and classification report for the Multinomial Naive Bayes classifier.
test_predict_mul = mnb.predict(test_x)
multinomial_accuracy = accuracy_score(y_true=test_y, y_pred=test_predict_mul)
print('Accuracy Score: ', multinomial_accuracy)
print(classification_report(y_true=test_y, y_pred=test_predict_mul))

# Confusion matrix for the Multinomial model, drawn as an annotated heatmap.
cm = confusion_matrix(test_y, test_predict_mul)
multinomial_class_names = ['Ham', 'Spam']
sns.set(font_scale=1.5)
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt='g', cbar=False)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.xaxis.set_ticklabels(multinomial_class_names)
ax.yaxis.set_ticklabels(multinomial_class_names)
plt.show()
# Class balance of the whole dataset (column 0 holds the 'spam' / 'ham' label).
df[0].value_counts()
# Share of spam messages, computed from the loaded data instead of hard-coded
# counts so it stays correct if the file or the parsing changes.
spam_proportion = float((df[0] == 'spam').mean())
print('Proportion of spam sms: ', spam_proportion)
Both models achieve high accuracy on the test set: 0.9752 for the Bernoulli model and 0.9865 for the Multinomial model.
Additionally, the confusion matrices show that both models tend to produce more false negatives (spam classified as ham) than false positives, which is to be expected given the imbalanced dataset we are working with.