Commit

Add files via upload
Sherryuu authored Apr 12, 2021
1 parent 1ec0df0 commit 43e9dc1
Showing 3 changed files with 114 additions and 0 deletions.
9 changes: 9 additions & 0 deletions Data/Robust04/a.py
@@ -0,0 +1,9 @@
import subprocess
trec_eval_f = '../../HGCF/bin/trec_eval'
VALIDATION_METRIC = 'ndcg_cut.20'
#runf = 'run.robust04.bm25.topics.robust04.txt'
for i in range(1,6):
    runf = 'test_run/f{}.test.run'.format(str(i))
    qrelf = 'qrels'
    output = subprocess.check_output([trec_eval_f, '-m', VALIDATION_METRIC, qrelf, runf]).decode().rstrip()
    print(output)
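
The script above prints one nDCG@20 line per fold. A minimal sketch of how the five per-fold scores could be collected and averaged, reusing the paths from a.py; it assumes the single-measure trec_eval output ends with the metric value as its last whitespace-separated token:

import subprocess

trec_eval_f = '../../HGCF/bin/trec_eval'
VALIDATION_METRIC = 'ndcg_cut.20'
scores = []
for i in range(1, 6):
    runf = 'test_run/f{}.test.run'.format(i)
    out = subprocess.check_output(
        [trec_eval_f, '-m', VALIDATION_METRIC, 'qrels', runf]).decode().rstrip()
    # the last whitespace-separated token on the line is the metric value
    scores.append(float(out.split()[-1]))
print('mean nDCG@20 over 5 folds: {:.4f}'.format(sum(scores) / len(scores)))
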
52 changes: 52 additions & 0 deletions Data/Robust04/build_dict.py
@@ -0,0 +1,52 @@
with open('clean.documents.txt', 'r') as f1, open('clean.queries.txt', 'r') as f2:
    docs = f1.readlines()
    qrls = f2.readlines()


remap_docs, remap_words, remap_qrls = {}, {}, {}
new_docs, new_qrls = [], []


for line in docs:
    did, sent = line.split('\t')
    words = sent.split()
    n = ''
    if did not in remap_docs:
        remap_docs[did] = len(remap_docs)
    n += str(remap_docs[did]) + '\t'
    for w in words:
        if w not in remap_words:
            remap_words[w] = len(remap_words)
        n += str(remap_words[w]) + ' '
    new_docs.append(n)


for line in qrls:
    qid, sent = line.split('\t')
    words = sent.split()
    n = ''
    if qid not in remap_qrls:
        remap_qrls[qid] = len(remap_qrls)
    n += str(remap_qrls[qid]) + '\t'
    for w in words:
        if w not in remap_words:
            remap_words[w] = len(remap_words)
        n += str(remap_words[w]) + ' '
    new_qrls.append(n)


with open('map.documents.txt', 'w') as f1, open('map.queries.txt', 'w') as f2:
    for line in new_docs:
        f1.writelines(line + '\n')
    for line in new_qrls:
        f2.writelines(line + '\n')


with open('doc_dict.txt', 'w') as f1, open('qrl_dict.txt', 'w') as f2, \
        open('word_dict.txt', 'w') as f3:
    for key in remap_docs.keys():
        f1.writelines(str(key) + '\t' + str(remap_docs[key]) + '\n')
    for key in remap_qrls.keys():
        f2.writelines(str(key) + '\t' + str(remap_qrls[key]) + '\n')
    for key in remap_words.keys():
        f3.writelines(str(key) + '\t' + str(remap_words[key]) + '\n')
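
build_dict.py turns string document/query IDs and words into consecutive integer IDs. A minimal, self-contained sketch of that remapping on two made-up input lines (the doc IDs and text below are illustrative only, not from the corpus); the output layout mirrors map.documents.txt, an integer doc ID, a tab, then space-separated word IDs:

docs = ['FBIS3-10082\tbritish ferry strike\n',
        'LA010189-0001\tferry safety report\n']

remap_docs, remap_words, new_docs = {}, {}, []
for line in docs:
    did, sent = line.split('\t')
    # assign the next free integer ID the first time an ID or word is seen
    remap_docs.setdefault(did, len(remap_docs))
    ids = [str(remap_words.setdefault(w, len(remap_words))) for w in sent.split()]
    new_docs.append('{}\t{}'.format(remap_docs[did], ' '.join(ids)))

print(new_docs)     # ['0\t0 1 2', '1\t1 3 4']
print(remap_words)  # {'british': 0, 'ferry': 1, 'strike': 2, 'safety': 3, 'report': 4}
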
53 changes: 53 additions & 0 deletions Data/Robust04/preprocess.py
@@ -0,0 +1,53 @@
import nltk
from nltk.corpus import stopwords
import re, string
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()
nums = [str(i) for i in range(10)]
with open('queries.tsv', 'r') as f:
    x = f.readlines()

def h_num(word):
    for i in word:
        if i in nums:
            return True
    return False

stop = set(stopwords.words('english'))

temp = []
freq = {}
id_list = set()
for line in x:
    sid = line.split('\t')[1]
    if sid in id_list:
        continue
    id_list.add(sid)
    sentence = line.split('\t')[-1]
    sentence = sentence.lower()
    sentence = sentence.replace('-', ' ')
    sentence = sentence.replace('--', ' ')
    #cleanr = re.compile('<.*?>')
    #sentence = re.sub(cleanr, ' ', sentence)
    #sentence = re.sub('[?|!|\'|"|#]', '', sentence)
    #sentence = re.sub('[.|,|\\\\|/]', '', sentence)
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    words = [word for word in sentence.split() if word not in stop and not h_num(word)]
    temp.append((sid, words))
    for word in words:
        if word in freq:
            freq[word] += 1
        else:
            freq[word] = 1

sent = []
for (sid, row) in temp:
    sequ = ''
    for word in row:
        if freq[word] >= 5:
            sequ = sequ + ' ' + stemmer.lemmatize(word)
    sent.append((sid, sequ))

with open('clean.queries.txt', 'w') as f:
    for (sid, s) in sent:
        f.writelines(sid + '\t' + s + '\n')
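
preprocess.py relies on NLTK's English stopword list and WordNet data being available locally. If they have not been downloaded yet, a one-time setup along these lines is needed before running the script (omw-1.4 is only required by WordNetLemmatizer on some newer NLTK releases):

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # needed by WordNetLemmatizer on newer NLTK versions
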
