Commit
Add most_similar function
ed98b8ec97b0 committed Jun 4, 2018
1 parent a1c0881 commit 1a61340
Showing 2 changed files with 56 additions and 25 deletions.
38 changes: 32 additions & 6 deletions src/Predictor.py
@@ -1,5 +1,6 @@
from gensim.models import KeyedVectors
from sklearn import neighbors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class Predictor:
@@ -17,6 +18,7 @@ def __init__(self, name):
self.model = KeyedVectors.load_word2vec_format(name, binary=True)
self.label = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


def top_rank(self, positive, negative = "", threshold = 0):
""" top_rank function takes lecture name then find top 10 most similar using gensim library. If there exists lecture whichs similarity is under threshhold, then that lecture is thrown.
@@ -41,12 +43,6 @@ def top_rank(self, positive, negative = "", threshold = 0):

return lecture_list

# def get_vector(self, positive, negative = ""):
# p = positive.split()
# if (negative != ""):
# n = negative.split()

# result =

def predict(self, target, lectures, scores):
"""predict function takes lecture vector list and lecture score list. These lists are associated idx. Using 2 lists, first calculate distance between target and neighbors. Then using scikit-learn library, find score.
@@ -69,3 +65,33 @@ def predict(self, target, lectures, scores):
score = clf.predict(target)[0]

return score


def get_vector(self, words):
""" get_vector sums the word vectors of the given tokens into a single lecture vector (assumes 100-dimensional word vectors)."""
sum_vector = np.zeros(100, dtype=float)
for word in words:
sum_vector = np.add(sum_vector, self.model.wv[word])

return sum_vector


def most_similar(self, target, lectures, threshold):
""" most_similar takes a (name, vector) target and a list of (name, vector) lectures, and returns up to 5 lectures whose cosine similarity to the target exceeds the threshold."""
result = []

for lecture in lectures:
if (target[0] == lecture[0]):
continue

# Cosine similarity between the target vector and this lecture vector.
numerator = np.sum(np.multiply(target[1], lecture[1]))
denominator = np.sqrt(np.sum(np.multiply(target[1], target[1]))) * np.sqrt(np.sum(np.multiply(lecture[1], lecture[1])))
similarity = numerator / denominator

if similarity > threshold:
result.append([lecture[0], lecture[1], similarity])

# Sort by similarity (index 2), most similar first; sorting on the vector at index 1 would raise an error.
result.sort(key=lambda x: x[2], reverse=True)

if (len(result) > 5):
result = result[0:5]

return result
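
As a reference for the new most_similar above: a minimal sketch, outside of this commit, showing that its hand-written cosine similarity agrees with the cosine_similarity helper imported at the top of Predictor.py (imported but not yet used here). The vectors below are made up for illustration.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Two made-up 100-dimensional "lecture" vectors, standing in for the
# (name, vector) pairs that main.py builds with Predictor.get_vector.
np.random.seed(0)
vec_a = np.random.rand(100)
vec_b = np.random.rand(100)

# The manual computation used inside most_similar ...
manual = np.sum(vec_a * vec_b) / (np.sqrt(np.sum(vec_a * vec_a)) * np.sqrt(np.sum(vec_b * vec_b)))

# ... matches sklearn, which expects 2-D inputs and returns a similarity matrix.
from_sklearn = cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))[0][0]

print(abs(manual - from_sklearn) < 1e-9)  # True
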
43 changes: 24 additions & 19 deletions src/main.py
@@ -1,45 +1,50 @@
from Lecture2Vec import Lecture2Vec
from Predictor import Predictor
# from konlpy.tag import Twitter
# from konlpy.tag import Mecab
import argparse, os

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-b', "--build", type=bool, default=True, help="Build the model True/False")
parser.add_argument('-v', "--vocab", type=str, default="corpus", help="Directory location corpus/<folder_name>")
parser.add_argument('-c', "--corpus", type=str, default="corpus", help="Directory location corpus/<folder_name>")
# parser.add_argument('-t', "--token", type=str, default="X", help="Use the tokenizer True/False")
parser.add_argument('-d', "--dist", type=bool, default=True, help="Separate vocab and corpus True/False")
parser.add_argument('-p', "--pred", type=bool, default=True, help="Use the predictor True/False")
parser.add_argument('-b', "--build", type=bool, default=False, help="Build the model True/False")
parser.add_argument('-v', "--vocab", type=str, default="", help="Directory location corpus/<folder_name>")
parser.add_argument('-c', "--corpus", type=str, default="", help="Directory location corpus/<folder_name>")
parser.add_argument('-d', "--dist", type=bool, default=False, help="Separate vocab and corpus True/False")
parser.add_argument('-p', "--pred", type=bool, default=False, help="Use the predictor True/False")
parser.add_argument('-n', '--name', type=str, default='data/auto.bin', help="Word vector save location data/<binary file name>.bin")
parser.add_argument('-l', '--lecture', type=str, default="lectures.txt")
parser.add_argument('-l', '--lecture', type=str, default="data/lectures.txt")
parser.add_argument('-t', '--threshold', type=float, default=0.0)
args = parser.parse_args()

print("\nparser statement")
print("build:\t", args.build)
print("vocab:\t", args.vocab)
print("corpus:\t", args.corpus)
# print("token:\t", args.token)
print("dist:\t", args.dist)
print("pred:\t", args.pred)
print("name:\t", args.name)
print("t.hold:\t", args.threshold)
print("\n")

print("--- Result ---")
if (args.build == True):
print("Build Model")
model = Lecture2Vec()
model.build(vocab=args.vocab, corpus=args.corpus, distinct=args.dist, name=args.name)

print("\n\n")

if (args.pred == True):
print("Predict lecture")
pred = Predictor(name=args.name)

lecture_file = open(args.lecture, 'r')
lectures = lecture_file.readlines()
for l in lectures:
token = l.split()
top_10 = pred.top_rank(positive=token)
print(l, ":", top_10)
lecture_list = lecture_file.read().splitlines()
lectures = []
for l in lecture_list:
tokens = l.split()
vector = pred.get_vector(words=tokens)
lectures.append((l, vector))

most_similars = {}
for lecture in lectures:
most_similars[lecture[0]] = pred.most_similar(target=lecture, lectures=lectures, threshold=args.threshold)

for k in most_similars.keys():
print(k)
print(most_similars[k][0])
print("-" * 10)
