# Naive_Bayes.py
# Forked from milaan9/Clustering_Algorithms_from_Scratch
#================================================================================================================
#----------------------------------------------------------------------------------------------------------------
# NAIVE BAYES
#----------------------------------------------------------------------------------------------------------------
#================================================================================================================
#Using Naive Bayes to classify emails as spam and non-spam
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import nltk
import re
#for plotting
plt.style.use('ggplot')
class CustomNB:
    """Naive-Bayes-style spam classifier built on a word-frequency table.

    Workflow: create_freq_table() -> train() -> predict().
    Labels follow the caller's convention: '0' = spam, '1' = not spam.
    """

    def __init__(self):
        # Stateless: the fitted models are returned by train(), not stored.
        pass

    def create_freq_table(self, texts, labels=None):
        """Build a document-term frequency DataFrame.

        texts  : list of tokenised documents (each a list of word strings).
        labels : optional per-document class labels; when given they are
                 stored in a 'CLASS' column alongside the word counts.
        Returns one row per document, one column per word, with missing
        counts filled with 0.
        """
        rows = []
        for index, tokens in enumerate(texts):
            # Count each distinct word once (set avoids recounting duplicates).
            freq_dict = pd.Series({word: tokens.count(word) for word in set(tokens)})
            # 'is not None' (not '!= None'): an empty label list is still valid.
            if labels is not None:
                freq_dict['CLASS'] = labels[index]
            rows.append(freq_dict)
        # Building from a list of Series replaces the per-row DataFrame.append
        # loop (DataFrame.append was removed in pandas 2.0).
        ft = pd.DataFrame(rows).reset_index(drop=True)
        return ft.fillna(0)

    def train(self, freq):
        """Fit per-word probabilities with additive (Laplace) smoothing.

        freq : DataFrame from create_freq_table() WITH a 'CLASS' column
               ('0' = spam, '1' = not spam).
        Returns (spam_model, non_spam_model): dicts word -> smoothed
        probability of the word under that class.
        """
        # Select the label column by name — the original positional iloc
        # silently read word counts as labels.
        labels = freq['CLASS'].astype(str).values
        fq = freq.drop(columns=['CLASS'])          # word-count matrix only
        vocabulary = list(fq.columns.values)
        spam_rows = fq[labels == '0']
        non_spam_rows = fq[labels == '1']
        # Total word occurrences observed per class.
        swc = float(spam_rows.values.sum())
        nswc = float(non_spam_rows.values.sum())
        alpha = 0.5                                # smoothing strength
        # Denominator includes alpha * |V| so the smoothed probabilities
        # form a proper distribution over the vocabulary.
        denom_extra = alpha * len(vocabulary)
        spam_model, non_spam_model = {}, {}
        for word in vocabulary:
            spam_model[word] = (spam_rows[word].sum() + alpha) / (swc + denom_extra)
            non_spam_model[word] = (non_spam_rows[word].sum() + alpha) / (nswc + denom_extra)
        return spam_model, non_spam_model

    def predict(self, text, spam_model, non_spam_model):
        """Score raw text against the two fitted models.

        Returns spam_score / non_spam_score: a ratio above 1 means the
        spam model assigns more probability mass to the text's words.
        """
        tokens = text.replace('\n', ' ').split()
        f_table = self.create_freq_table([tokens])
        vocabulary = f_table.columns.values
        # Sum per-word probabilities under each model; words unseen at
        # training time are skipped.
        spam_prob = sum(spam_model[w] for w in vocabulary if w in spam_model)
        non_spam_prob = sum(non_spam_model[w] for w in vocabulary if w in non_spam_model)
        print(spam_prob, non_spam_prob)
        if non_spam_prob == 0:
            # No overlap with the training vocabulary on the non-spam side:
            # avoid ZeroDivisionError and report maximally spammy.
            return float('inf')
        return spam_prob / non_spam_prob
def main():
    """Train the toy classifier on a tiny corpus and classify one email."""
    # Corpus keys are the class labels: '0' = spam, '1' = not spam.
    emails = {"0": ["Dear friend, win 1000$ cash right now!!.", " Webcams Day & Night - All LIVE - Webcams Contest",
                    "Congratulations, you've won a free car!"],
              "1": ["Hey, Can you redo the presentation and send it on by Friday?", "Thank you for the documents, I will revert as soon as possible",
                    "Open the Pod bay doors, HAL.", " I'm sorry, Dave. I'm afraid I can't do that."]}
    texts = []
    labels = []
    for label, mails in emails.items():
        for mail in mails:
            tokens = [word.lower() for sent in nltk.sent_tokenize(mail)
                      for word in nltk.word_tokenize(sent)]
            # Keep only tokens containing at least one letter (drops numeric
            # tokens and raw punctuation).
            texts.append([tok for tok in tokens if re.search('[a-zA-Z]', tok)])
            labels.append(label)
    nb = CustomNB()
    freq_table = nb.create_freq_table(texts, labels)
    sp, nsp = nb.train(freq_table)
    # Other sample inputs to try:
    #ip = "Hey, Can you redo the presentation and send it on by Friday?"
    #ip = "Webcams Day & Night - All LIVE - Webcams Contest"
    #ip = "I'm sorry, Dave. I'm afraid I can't do that."
    ip = "Open the Pod bay doors, HAL."
    propr_ip = ' '.join(tok.lower() for tok in nltk.word_tokenize(ip)
                        if re.search('[a-zA-Z]', tok))
    x = nb.predict(propr_ip, sp, nsp)
    print(x)
    # predict() returns spam_score / non_spam_score, so a ratio above 1
    # means the spam model wins (the original x < 0.5 test only matched the
    # swapped model dicts in the old train()).
    if x > 1:
        print("spam")
    else:
        print("not spam")


if __name__ == "__main__":
    main()