info.py
from __future__ import division
from math import log, exp
from operator import mul
from collections import Counter
import os
import pickle
class MyDict(dict):
    def __getitem__(self, key):
        if key in self:
            return self.get(key)
        return 0
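# Illustrative note (added): MyDict is a dict that returns 0 instead of raising
# KeyError for missing keys, similar to collections.defaultdict(int), e.g.
#   counts = MyDict()
#   counts["great"] += 1
#   counts["unseen"]   # -> 0
# so unseen features simply contribute a zero count during classification.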
pos = MyDict()   # feature -> count in positive training documents
neg = MyDict()   # feature -> count in negative training documents
features = set()
totals = [0, 0]  # aggregate feature counts for the [positive, negative] classes
delchars = ''.join(c for c in map(chr, range(128)) if not c.isalnum())  # non-alphanumeric ASCII
# CDATA_FILE = "countdata.pickle"
FDATA_FILE = "reduceddata.pickle"  # pretrained (pos, neg, totals) data
def negate_sequence(text):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.,!:;"
    result = []
    words = text.split()
    prev = None
    pprev = None
    for word in words:
        # stripped = word.strip(delchars)
        stripped = word.strip(delims).lower()
        negated = "not_" + stripped if negation else stripped
        result.append(negated)
        if prev:
            bigram = prev + " " + negated
            result.append(bigram)
            if pprev:
                trigram = pprev + " " + bigram
                result.append(trigram)
            pprev = prev
        prev = negated

        if any(neg in word for neg in ["not", "n't", "no"]):
            negation = not negation

        if any(c in word for c in delims):
            negation = False

    return result
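# Illustrative example (added, not part of the original file): negate_sequence()
# emits unigrams, bigrams and trigrams, with tokens after a negation word
# prefixed by "not_" until the next punctuation delimiter, e.g.
#   negate_sequence("this is not a good movie.")
#   -> ['this', 'is', 'this is', 'not', 'is not', 'this is not',
#       'not_a', 'not not_a', 'is not not_a',
#       'not_good', 'not_a not_good', 'not not_a not_good',
#       'not_movie', 'not_good not_movie', 'not_a not_good not_movie']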
def classify2(text):
    """
    For classification from pretrained data
    """
    words = set(word for word in negate_sequence(text) if word in pos or word in neg)
    if len(words) == 0:
        return True, 0
    # Probability that word occurs in pos documents
    pos_prob = sum(log((pos[word] + 1) / (2 * totals[0])) for word in words)
    neg_prob = sum(log((neg[word] + 1) / (2 * totals[1])) for word in words)
    return (pos_prob > neg_prob, abs(pos_prob - neg_prob))
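# Note (added for clarity): each class score above is a sum of add-one (Laplace)
# smoothed log-likelihoods,
#     log((count_in_class(word) + 1) / (2 * totals_for_class)),
# where totals[0] / totals[1] are the positive / negative denominators. The class
# with the larger sum wins, and abs(pos_prob - neg_prob) serves as a rough
# confidence margin.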
def classify_demo(text):
    words = set(word for word in negate_sequence(text) if word in pos or word in neg)
    if len(words) == 0:
        print "No features to compare on"
        return True

    pprob, nprob = 0, 0
    for word in words:
        pp = log((pos[word] + 1) / (2 * totals[0]))
        np = log((neg[word] + 1) / (2 * totals[1]))
        print "%15s %.9f %.9f" % (word, exp(pp), exp(np))
        pprob += pp
        nprob += np

    print ("Positive" if pprob > nprob else "Negative"), "log-diff = %.9f" % abs(pprob - nprob)
def feature_selection_trials():
    """
    Select top k features. Vary k and plot data
    """
    global pos, neg, totals, features
    retrain = False

    # Load previously reduced feature counts if they are available.
    if not retrain and os.path.isfile(FDATA_FILE):
        with open(FDATA_FILE, "rb") as f:
            pos, neg, totals = pickle.load(f)
        return
if __name__ == '__main__':
    feature_selection_trials()

def setup():
    feature_selection_trials()
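# Usage sketch (added for illustration; assumes reduceddata.pickle was produced
# by a prior training run and sits next to this file):
#
#   import info
#   info.setup()                                   # loads pos, neg, totals
#   label, margin = info.classify2("It was not a bad film at all")
#   print "Positive" if label else "Negative", margin
#   info.classify_demo("It was not a bad film at all")   # per-feature breakdown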