Skip to content

Commit

Permalink
[only TA]
Browse files (browse the repository at this point in the history)
  • Loading branch information
supercoderhawk committed Jan 8, 2020
1 parent 915cc4b commit 867591e
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions wsdm_digg/search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@


class KeywordSearch(object):
field_weight = {'title': 4,
'abstract': 6,
field_weight = {'title': 2,
'abstract': 3,
'TA': 10,
'keywords': 1}
# Matches one reserved punctuation character (per the attribute name, the ones
# that are special in an Elasticsearch query string) and captures it in the
# named group ``PUNC``.
# The hyphen is escaped on purpose: an unescaped ``+-=`` inside a character
# class is the range U+002B..U+003D, which also matches digits, ",", ".", ";"
# and "<" instead of just the three literals "+", "-" and "=".
es_special_char_regex = re.compile(r'(?P<PUNC>[+\-=&|!(){}\[\]^"~*?:/])')
# Matches a citation-style author list ending in "et al": one to five groups of
# one to three capitalised words, each group optionally followed by ", ".
cites_person_name_regex = re.compile(r'(?:(?:[A-Z][a-z]{1,20} ?){1,3}(?:, )?){1,5}et al')
Expand All @@ -24,7 +25,7 @@ def __init__(self):
def search(self, text, cites_text, top_n, paper_keywords=None):
if not text:
raise ValueError('input search text is empty')
# doc = self.nlp(text)

noun_chunks = self.extractor.get_noun_chunk(cites_text)
noun_chunks = self.format_terms(noun_chunks)
noun_chunks = ['"' + noun + '"' for noun in noun_chunks]
Expand All @@ -34,24 +35,24 @@ def search(self, text, cites_text, top_n, paper_keywords=None):
if not query_terms:
query_terms = self.extractor.get_query_words(text)
query_terms = self.format_terms(query_terms)
# keywords = self.extractor.textrank(doc, 10,
# window_size=2,
# edge_weighting='binary')

cites_keywords = self.extractor.textrank(cites_text, 15, window_size=2,
edge_weighting='binary')

query_terms = query_terms + cites_keywords

important_keywords = self.format_terms(self.extractor.get_query_words(text))
query_terms = query_terms + important_keywords
# print(noun_chunks)

query_terms = [term for term in query_terms if term.strip()]
if not query_terms:
query_terms = self.format_terms([text])
# paper_keywords = self.format_terms(paper_keywords)
# paper_keywords = ['"' + k + '"' if ' ' in k else k for k in paper_keywords]
query_dict = {'title': query_terms,
'abstract': query_terms}
query_dict = {
# 'title': query_terms,
# 'abstract': query_terms,
'TA': query_terms}
es_query_obj = self.build_es_query_string_object(query_dict, top_n)
ret = requests.post(self.search_url, json=es_query_obj, headers=self.headers)
searched_paper_id = []
Expand Down

0 comments on commit 867591e

Please sign in to comment.