Email Classifier

Code for Python Email Spam Classifier

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
# Read the raw dataset from the working directory and show its label column.
data = pd.read_csv(filepath_or_buffer='spam.csv')
print(data.loc[:, 'v1'])

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

# Data preprocessing
# Drop every column whose name contains 'Unnamed'.
columnsdrop = [col for col in data.columns if 'Unnamed' in col]
data.drop(columns=columnsdrop, inplace=True)
# Label-encode the target: ham -> 1, spam -> 0 (any other value becomes NaN).
data['v1'] = data['v1'].map({'ham': 1, 'spam': 0})
cv = CountVectorizer()
# Remove rows with missing values, including labels the mapping did not recognise.
data = data.dropna()
print(data)
# Hold out 30% of the messages for testing. The seed is fixed so the split,
# and every score computed from it, is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data['v2'], data['v1'], test_size=0.3, random_state=42
)
# Bag of words: the vocabulary is learned from the training messages only.
# x_test stays raw text and must go through cv.transform() before prediction.
x_train = cv.fit_transform(x_train)

      v1                                                 v2
0      1  Go until jurong point, crazy.. Available only ...
1      1                      Ok lar... Joking wif u oni...
2      0  Free entry in 2 a wkly comp to win FA Cup fina...
3      1  U dun say so early hor... U c already then say...
4      1  Nah I don't think he goes to usf, he lives aro...
...   ..                                                ...
5567   0  This is the 2nd time we have tried 2 contact u...
5568   1              Will Ì_ b going to esplanade fr home?
5569   1  Pity, * was in mood for that. So...any other s...
5570   1  The guy did some bitching but I acted like i'd...
5571   1                         Rofl. Its true to its name

[5572 rows x 2 columns]

from sklearn.linear_model import LogisticRegression
# Build the classifier. Despite the variable name, this is a logistic
# regression model, not multinomial naive Bayes; the name is kept because
# the cells below refer to it.
MNB = LogisticRegression(
    max_iter=100,
    solver='lbfgs',
    C=1.0,
)

# Fit on the bag-of-words training matrix and its ham/spam labels.
MNB.fit(X=x_train, y=y_train)

LogisticRegression()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# r2_score stays imported in case later cells still use it.
from sklearn.metrics import accuracy_score, r2_score
print(type(x_test))
# Vectorise the held-out text with the vocabulary learned during training.
# The sparse matrices go straight to predict(); converting them with
# .toarray() only costs memory.
predicted = MNB.predict(cv.transform(x_test))
p2 = MNB.predict(x_train)
print(predicted)
# Report accuracy (fraction of correctly labelled messages). R^2 is a
# regression metric and is not meaningful for 0/1 class labels.
print(accuracy_score(y_test, predicted))   # held-out accuracy
print(accuracy_score(y_train, p2))         # training accuracy, for comparison
print(y_test)
print()

<class 'pandas.core.series.Series'>
[1 1 0 ... 1 1 1]
0.8271387955544068
0.9848356501090111
   1
  1
  0
  1
  1
       ..
  1
  1
  1
  1
   1
Name: v1, Length: 1672, dtype: int64

# Try the trained model on one hand-written message.
# With the label mapping used above, 1 means ham and 0 means spam.
text = ["click on my link thing.com"]
sample_features = cv.transform(text)  # reuse the already-fitted vocabulary
testingvalue = MNB.predict(sample_features)
print(testingvalue)

[1]

# Inspect the fitted model: one weight per vocabulary feature, plus the intercept.
coefficients, intercept = MNB.coef_, MNB.intercept_
print(coefficients, intercept)

[[-0.42117837 -0.40637999 -0.00323472 ...  0.1307255   0.00748295
   0.00334214]] [4.65871166]

Email Classifier • 5 min read

Description

Code for Python Email Spam Classifier