Code for Python Email Spam Classifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
# Load the dataset from spam.csv and show the raw label column ('v1').
data = pd.read_csv(filepath_or_buffer='spam.csv')
print(data.loc[:, 'v1'])
0 ham
1 ham
2 spam
3 ham
4 ham
...
5567 spam
5568 ham
5569 ham
5570 ham
5571 ham
Name: v1, Length: 5572, dtype: object
# Data pre-processing
# Drop every column whose name contains 'Unnamed'; only v1 (label) and
# v2 (message text) are used below.
columnsdrop = [col for col in data.columns if 'Unnamed' in col]
data.drop(columns=columnsdrop, inplace=True)
# Label encoding (not one-hot): ham -> 1, spam -> 0. Any other label value
# becomes NaN here and is removed by the dropna() below.
data['v1'] = data['v1'].map({'ham': 1, 'spam': 0})
cv = CountVectorizer()
data = data.dropna()
print(data)
# A fixed seed makes the split (and every downstream score) reproducible;
# stratifying on the label keeps the ham/spam ratio equal in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    data['v2'],
    data['v1'],
    test_size=0.3,
    random_state=42,
    stratify=data['v1'],
)
# Bag of words: the vocabulary is fitted on the training messages only, so
# x_test must later go through cv.transform(), never fit_transform().
x_train = cv.fit_transform(x_train)
v1 v2
0 1 Go until jurong point, crazy.. Available only ...
1 1 Ok lar... Joking wif u oni...
2 0 Free entry in 2 a wkly comp to win FA Cup fina...
3 1 U dun say so early hor... U c already then say...
4 1 Nah I don't think he goes to usf, he lives aro...
... .. ...
5567 0 This is the 2nd time we have tried 2 contact u...
5568 1 Will Ì_ b going to esplanade fr home?
5569 1 Pity, * was in mood for that. So...any other s...
5570 1 The guy did some bitching but I acted like i'd...
5571 1 Rofl. Its true to its name
[5572 rows x 2 columns]
from sklearn.linear_model import LogisticRegression

# NOTE: despite the variable name `MNB`, the classifier is a logistic
# regression; the name is kept because other cells refer to it.
logreg_params = {'C': 1.0, 'solver': 'lbfgs', 'max_iter': 100}
MNB = LogisticRegression(**logreg_params)
MNB.fit(x_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# r2_score is no longer used here; it stays imported in case later cells need it.
from sklearn.metrics import accuracy_score, r2_score

print(type(x_test))
# x_test is still raw text, so vectorise it with the vocabulary fitted on the
# training set. predict() accepts sparse matrices, so the count matrices are
# passed as-is instead of being densified with .toarray().
predicted = MNB.predict(cv.transform(x_test))
p2 = MNB.predict(x_train)
print(predicted)
# The labels are categorical (0/1), so report classification accuracy rather
# than R^2, which is a regression metric. Test score first, then train score.
print(accuracy_score(y_test, predicted))
print(accuracy_score(y_train, p2))
print(y_test)
print()
<class 'pandas.core.series.Series'>
[1 1 0 ... 1 1 1]
0.8271387955544068
0.9848356501090111
319 1
3471 1
1021 0
5299 1
1092 1
..
1636 1
4517 1
4431 1
3666 1
866 1
Name: v1, Length: 1672, dtype: int64
# Classify one hand-written message: vectorise it with the fitted
# CountVectorizer, then predict (labels were encoded as 1 = ham, 0 = spam).
text = ["click on my link thing.com"]
text_features = cv.transform(text)
testingvalue = MNB.predict(text_features)
print(testingvalue)
[1]
# Inspect the fitted model: the per-feature coefficients and the intercept.
coefficients, intercept = MNB.coef_, MNB.intercept_
print(coefficients, intercept)
[[-0.42117837 -0.40637999 -0.00323472 ... 0.1307255 0.00748295
0.00334214]] [4.65871166]