tfidf.py
import torch
from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' tokenizer data


class TF_IDF:
    """
    TF  - Term Frequency: count of a specific word in a document / total number of words in that document
    IDF - Inverse Document Frequency: log of (total number of documents / number of documents containing the word)
    """

    def text_processing(self, X):
        """
        Clean the text: keep only alphabetic tokens, lower-case them, turn each
        sentence into a list of words, and collect the unique words across all
        documents combined.
        :param X: list of documents
        :return: unique words (vocabulary), all documents as [[d_1], [d_2], ..., [d_n]]
        """
        documents = []
        vocabulary = []
        for document in X:
            document_words = [word.lower() for word in word_tokenize(document) if word.isalpha()]
            documents.append(document_words)
            for word in document_words:
                if word not in vocabulary:
                    vocabulary.append(word)
        # note: set iteration order is not deterministic across runs
        vocabulary = set(vocabulary)
        return vocabulary, documents
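    # For the toy corpus in __main__ below, this yields (illustratively) a
    # 10-word vocabulary {'hi', 'how', 'are', 'you', 'what', 'doing', 'is',
    # 'your', 'name', 'who'} and tokenized documents such as
    # ['hi', 'how', 'are', 'you'] (punctuation is dropped by isalpha()).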

    def strtoint(self, vocabulary):
        """
        :param vocabulary: all unique words in the documents
        :return: mapping from word to integer index, e.g. {'the': 0}
        """
        wordToInt = {}
        for i, vocab in enumerate(vocabulary):
            wordToInt[vocab] = i
        return wordToInt

    def vocab_frequency(self, vocabulary, documents):
        """
        :param vocabulary: all unique words in the documents
        :param documents: all the documents
        :return: document frequency, i.e. the number of documents that contain each word
        """
        word_frequency = {}
        for word in vocabulary:
            word_frequency[word] = 0
            for document in documents:
                if word in document:
                    word_frequency[word] += 1
        return word_frequency
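    # With the toy corpus, this gives document frequencies such as
    # {'are': 3, 'you': 3, 'what': 2, 'hi': 1, ...} (illustrative).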

    def tf(self, input_document, word):
        """
        Calculate the term frequency
        :param input_document: test document
        :param word: a word from the test document
        :return: tf value (see the formula in the class docstring)
        """
        num_words = len(input_document)
        word_frequency = input_document.count(word)
        return word_frequency / num_words
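    # Example: tf(['who', 'are', 'you'], 'you') = 1/3, since 'you' occurs
    # once among three tokens.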

    def idf(self, word, word_frequency, documents):
        """
        :param word: a word from the test input document
        :param word_frequency: document frequency of each word across all available documents
        :param documents: all the documents
        :return: idf value (with add-one smoothing of the document frequency)
        """
        # .get avoids a bare except; the +1 smooths the document frequency so
        # a word never seen in the corpus still yields a finite idf.
        document_frequency = word_frequency.get(word, 0) + 1
        return torch.log(torch.scalar_tensor(len(documents)) / document_frequency)
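    # Example: 'are' occurs in 3 of the 4 toy documents, so with smoothing
    # idf = log(4 / (3 + 1)) = 0; a word present almost everywhere carries
    # no weight.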

    def fit_transform(self, document, vocabulary, wordToInt, word_frequency, documents):
        """
        :param document: test input document
        :param vocabulary: all unique words
        :param wordToInt: word-to-int mapping
        :param word_frequency: document frequency of each word across all the documents
        :param documents: all the documents
        :return: tf-idf vector for the test input document
        """
        tfidf_vector = torch.zeros((len(vocabulary),), dtype=torch.double)
        for word in document:
            tf = self.tf(document, word)
            idf = self.idf(word, word_frequency, documents)
            tfidf_values = tf * idf
            tfidf_vector[wordToInt[word]] = tfidf_values
        return tfidf_vector


if __name__ == '__main__':
    documents = ['Hi, how are you?',
                 'What are you doing?',
                 'what is your name?',
                 'who are you?']
    tfidf_vectorizer = TF_IDF()
    vocabulary, processed_documents = tfidf_vectorizer.text_processing(documents)
    wordToInt = tfidf_vectorizer.strtoint(vocabulary)
    vocab_frequency = tfidf_vectorizer.vocab_frequency(vocabulary, processed_documents)
    _, new_document = tfidf_vectorizer.text_processing([documents[0]])
    print(tfidf_vectorizer.fit_transform(new_document[0], vocabulary, wordToInt,
                                         vocab_frequency, processed_documents))
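
    # Usage sketch (illustrative): vectorize every document in the corpus and
    # stack the results into a (num_documents, vocabulary_size) matrix.
    doc_vectors = torch.stack([
        tfidf_vectorizer.fit_transform(doc, vocabulary, wordToInt,
                                       vocab_frequency, processed_documents)
        for doc in processed_documents
    ])
    print(doc_vectors.shape)  # torch.Size([4, 10]) for this toy corpus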