Commit

Add files via upload
Sherryuu authored Apr 12, 2021
1 parent 1ec0df0 commit 43e9dc1
Showing 3 changed files with 114 additions and 0 deletions.
9 changes: 9 additions & 0 deletions Data/Robust04/a.py
@@ -0,0 +1,9 @@
import subprocess
trec_eval_f = '../../HGCF/bin/trec_eval'
VALIDATION_METRIC = 'ndcg_cut.20'
#runf = 'run.robust04.bm25.topics.robust04.txt'
for i in range(1,6):
    runf = 'test_run/f{}.test.run'.format(str(i))
    qrelf = 'qrels'
    output = subprocess.check_output([trec_eval_f, '-m', VALIDATION_METRIC, qrelf, runf]).decode().rstrip()
    print(output)
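
The script above prints one nDCG@20 line per fold. A minimal sketch of how the five per-fold scores could be collected and averaged, reusing the paths from a.py; it assumes the single-measure trec_eval output ends with the metric value as its last whitespace-separated token:

import subprocess

trec_eval_f = '../../HGCF/bin/trec_eval'
VALIDATION_METRIC = 'ndcg_cut.20'
scores = []
for i in range(1, 6):
    runf = 'test_run/f{}.test.run'.format(i)
    out = subprocess.check_output(
        [trec_eval_f, '-m', VALIDATION_METRIC, 'qrels', runf]).decode().rstrip()
    # the last whitespace-separated token on the line is the metric value
    scores.append(float(out.split()[-1]))
print('mean nDCG@20 over 5 folds: {:.4f}'.format(sum(scores) / len(scores)))
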
52 changes: 52 additions & 0 deletions Data/Robust04/build_dict.py
@@ -0,0 +1,52 @@
with open('clean.documents.txt', 'r') as f1, open('clean.queries.txt', 'r') as f2:
    docs = f1.readlines()
    qrls = f2.readlines()


remap_docs, remap_words, remap_qrls = {}, {}, {}
new_docs, new_qrls = [], []


for line in docs:
    did, sent = line.split('\t')
    words = sent.split()
    n = ''
    if did not in remap_docs:
        remap_docs[did] = len(remap_docs)
    n += str(remap_docs[did]) + '\t'
    for w in words:
        if w not in remap_words:
            remap_words[w] = len(remap_words)
        n += str(remap_words[w]) + ' '
    new_docs.append(n)


for line in qrls:
    qid, sent = line.split('\t')
    words = sent.split()
    n = ''
    if qid not in remap_qrls:
        remap_qrls[qid] = len(remap_qrls)
    n += str(remap_qrls[qid]) + '\t'
    for w in words:
        if w not in remap_words:
            remap_words[w] = len(remap_words)
        n += str(remap_words[w]) + ' '
    new_qrls.append(n)


with open('map.documents.txt', 'w') as f1, open('map.queries.txt', 'w') as f2:
    for line in new_docs:
        f1.writelines(line + '\n')
    for line in new_qrls:
        f2.writelines(line + '\n')


with open('doc_dict.txt', 'w') as f1, open('qrl_dict.txt', 'w') as f2, \
        open('word_dict.txt', 'w') as f3:
    for key in remap_docs.keys():
        f1.writelines(str(key) + '\t' + str(remap_docs[key]) + '\n')
    for key in remap_qrls.keys():
        f2.writelines(str(key) + '\t' + str(remap_qrls[key]) + '\n')
    for key in remap_words.keys():
        f3.writelines(str(key) + '\t' + str(remap_words[key]) + '\n')
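
build_dict.py turns string document/query IDs and words into consecutive integer IDs. A minimal, self-contained sketch of that remapping on two made-up input lines (the doc IDs and text below are illustrative only, not from the corpus); the output layout mirrors map.documents.txt, an integer doc ID, a tab, then space-separated word IDs:

docs = ['FBIS3-10082\tbritish ferry strike\n',
        'LA010189-0001\tferry safety report\n']

remap_docs, remap_words, new_docs = {}, {}, []
for line in docs:
    did, sent = line.split('\t')
    # assign the next free integer ID the first time an ID or word is seen
    remap_docs.setdefault(did, len(remap_docs))
    ids = [str(remap_words.setdefault(w, len(remap_words))) for w in sent.split()]
    new_docs.append('{}\t{}'.format(remap_docs[did], ' '.join(ids)))

print(new_docs)     # ['0\t0 1 2', '1\t1 3 4']
print(remap_words)  # {'british': 0, 'ferry': 1, 'strike': 2, 'safety': 3, 'report': 4}
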
53 changes: 53 additions & 0 deletions Data/Robust04/preprocess.py
@@ -0,0 +1,53 @@
import nltk
from nltk.corpus import stopwords
import re, string
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()
nums = [str(i) for i in range(10)]
with open('queries.tsv', 'r') as f:
    x = f.readlines()

def h_num(word):
    for i in word:
        if i in nums:
            return True
    return False

stop = set(stopwords.words('english'))

temp = []
freq = {}
id_list = set()
for line in x:
    sid = line.split('\t')[1]
    if sid in id_list:
        continue
    id_list.add(sid)
    sentence = line.split('\t')[-1]
    sentence = sentence.lower()
    sentence = sentence.replace('-', ' ')
    sentence = sentence.replace('--', ' ')
    #cleanr = re.compile('<.*?>')
    #sentence = re.sub(cleanr, ' ', sentence)
    #sentence = re.sub('[?|!|\'|"|#]', '', sentence)
    #sentence = re.sub('[.|,|\\\\|/]', '', sentence)
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    words = [word for word in sentence.split() if word not in stop and not h_num(word)]
    temp.append((sid, words))
    for word in words:
        if word in freq:
            freq[word] += 1
        else:
            freq[word] = 1

sent = []
for (sid, row) in temp:
    sequ = ''
    for word in row:
        if freq[word] >= 5:
            sequ = sequ + ' ' + stemmer.lemmatize(word)
    sent.append((sid, sequ))

with open('clean.queries.txt', 'w') as f:
    for (sid, s) in sent:
        f.writelines(sid + '\t' + s + '\n')
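
preprocess.py relies on NLTK's English stopword list and WordNet data being available locally. If they have not been downloaded yet, a one-time setup along these lines is needed before running the script (omw-1.4 is only required by WordNetLemmatizer on some newer NLTK releases):

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # needed by WordNetLemmatizer on newer NLTK versions
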
