add codes
supercoderhawk committed Jan 16, 2020
1 parent 8e8d613 commit 9b3be83
Showing 31 changed files with 1,145 additions and 253 deletions.
89 changes: 81 additions & 8 deletions scripts/predict_rerank.sh
@@ -2,21 +2,94 @@

DATA_DIR=${PWD}/data
RESULT_DIR=$DATA_DIR/submit_result
VALID_FILE=$DATA_DIR/validation.jsonl
VALID_FILE=$DATA_DIR/test_release.jsonl
MODEL_BASEDIR=$DATA_DIR/rerank

EPOCH=2
STEP=115000
export CUDA_VISIBLE_DEVICES=0

EPOCH=5
STEP=60000
TOPK=20
MODEL_BASENAME=only_TA_sample8
MODEL_BASENAME=only_TA_shuffle
RERANK_NAME='plm'

#EPOCH=2
#STEP=130000
#TOPK=20
#MODEL_BASENAME=only_TA_sample8_stop_scheduler
#RERANK_NAME='plm'

MODEL_DIR=$MODEL_BASEDIR/${MODEL_BASENAME}_$RERANK_NAME


python3 wsdm_digg/reranking/predict.py -eval_search_filename $RESULT_DIR/only_TA_release.jsonl \
-golden_filename $VALID_FILE \
-dest_filename $RESULT_DIR/${MODEL_BASENAME}_step_${STEP}_top${TOPK}_release.jsonl \
-model_path $MODEL_DIR/${MODEL_BASENAME}_epoch_${EPOCH}_step_$STEP.model \
-eval_batch_size 10 -topk $TOPK

EPOCH=5
STEP=60000
TOPK=200
MODEL_BASENAME=only_TA_shuffle
RERANK_NAME='plm'

#EPOCH=2
#STEP=130000
#TOPK=20
#MODEL_BASENAME=only_TA_sample8_stop_scheduler
#RERANK_NAME='plm'

MODEL_DIR=$MODEL_BASEDIR/${MODEL_BASENAME}_$RERANK_NAME

export CUDA_VISIBLE_DEVICES=1

python3 wsdm_digg/reranking/predict.py -eval_search_filename $RESULT_DIR/only_TA.jsonl \
python3 wsdm_digg/reranking/predict.py -eval_search_filename $RESULT_DIR/only_TA_release.jsonl \
-golden_filename $VALID_FILE \
-dest_filename $RESULT_DIR/${MODEL_BASENAME}_step_${STEP}_top${TOPK}.jsonl \
-dest_filename $RESULT_DIR/${MODEL_BASENAME}_step_${STEP}_top${TOPK}_release.jsonl \
-model_path $MODEL_DIR/${MODEL_BASENAME}_epoch_${EPOCH}_step_$STEP.model \
-eval_batch_size 10
-eval_batch_size 10 -topk $TOPK
#EPOCH=1
#STEP=80000
#TOPK=20
#MODEL_BASENAME=only_TA_sample8_stop_scheduler
#RERANK_NAME='plm'
#
#MODEL_DIR=$MODEL_BASEDIR/${MODEL_BASENAME}_$RERANK_NAME
#
#
#python3 wsdm_digg/reranking/predict.py -eval_search_filename $RESULT_DIR/only_TA.jsonl \
# -golden_filename $VALID_FILE \
# -dest_filename $RESULT_DIR/${MODEL_BASENAME}_step_${STEP}_top${TOPK}.jsonl \
# -model_path $MODEL_DIR/${MODEL_BASENAME}_epoch_${EPOCH}_step_$STEP.model \
# -eval_batch_size 10 -topk $TOPK
#
#
#EPOCH=1
#STEP=45000
#TOPK=20
#MODEL_BASENAME=only_TA_sample8_stop_scheduler
#RERANK_NAME='plm'
#
#MODEL_DIR=$MODEL_BASEDIR/${MODEL_BASENAME}_$RERANK_NAME
#
#
#python3 wsdm_digg/reranking/predict.py -eval_search_filename $RESULT_DIR/only_TA.jsonl \
# -golden_filename $VALID_FILE \
# -dest_filename $RESULT_DIR/${MODEL_BASENAME}_step_${STEP}_top${TOPK}.jsonl \
# -model_path $MODEL_DIR/${MODEL_BASENAME}_epoch_${EPOCH}_step_$STEP.model \
# -eval_batch_size 10 -topk $TOPK
#
#EPOCH=2
#STEP=130000
#TOPK=50
#MODEL_BASENAME=only_TA_sample8_stop_scheduler
#RERANK_NAME='plm'
#
#MODEL_DIR=$MODEL_BASEDIR/${MODEL_BASENAME}_$RERANK_NAME
#
#
#python3 wsdm_digg/reranking/predict.py -eval_search_filename $RESULT_DIR/only_TA.jsonl \
# -golden_filename $VALID_FILE \
# -dest_filename $RESULT_DIR/${MODEL_BASENAME}_step_${STEP}_top${TOPK}.jsonl \
# -model_path $MODEL_DIR/${MODEL_BASENAME}_epoch_${EPOCH}_step_$STEP.model \
# -eval_batch_size 10 -topk $TOPK
15 changes: 11 additions & 4 deletions scripts/prepare_rerank.sh
@@ -2,14 +2,21 @@

# build data
DATA_DIR=${PWD}/data
#SEARCH_FILENAME=$DATA_DIR/result/only_TA_train.jsonl
# SEARCH_FILENAME=$DATA_DIR/result/only_TA_train.jsonl
SEARCH_FILENAME=$DATA_DIR/result/only_TA.jsonl
#GOLDEN_FILENAME=$DATA_DIR/train.jsonl
# GOLDEN_FILENAME=$DATA_DIR/train.jsonl
GOLDEN_FILENAME=$DATA_DIR/test.jsonl
#DEST_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_search_result_offset_10.jsonl
DEST_FILENAME=$DATA_DIR/test_vectorization.jsonl
DEST_FILENAME=$DATA_DIR/test_vectorization_offset20.jsonl
# DEST_FILENAME=$DATA_DIR/only_TA_sample10_aggregate.jsonl

python wsdm_digg/data_process/rerank_data_builder.py -search_filename $SEARCH_FILENAME \
-golden_filename $GOLDEN_FILENAME -dest_filename $DEST_FILENAME \
-select_strategy 'search_result_offset' -offset 100 -sample_count 1
-select_strategy 'search_result_offset' -offset 20 -sample_count 1


# DEST_FILENAME=$DATA_DIR/only_TA_sample8.jsonl.bak

# python wsdm_digg/data_process/rerank_data_builder.py -search_filename $SEARCH_FILENAME \
# -golden_filename $GOLDEN_FILENAME -dest_filename $DEST_FILENAME \
# -select_strategy 'search_result_offset' -offset 2 -sample_count 8
57 changes: 40 additions & 17 deletions scripts/run_vectorization.sh
@@ -1,7 +1,30 @@
#!/bin/bash

DATA_DIR=${PWD}/data
PLM_MODEL_NAME='scibert-scivocab-uncased'
# PLM_MODEL_NAME='scibert-scivocab-uncased'

# vectorize paper

export CUDA_VISIBLE_DEVICES=1

MODEL_PATH=$DATA_DIR/vectorization/dssm_loss_cls/dssm_loss_cls_epoch_1_step_50000.model
SRC_FILENAME=$DATA_DIR/candidate_paper_for_wsdm2020.jsonl
DEST_FILENAME=$DATA_DIR/candidate_paper_dssm_loss_cls_vector.txt
BATCH_SIZE=20

python3 wsdm_digg/vectorization/predict.py -model_path $MODEL_PATH \
-src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME \
-batch_size $BATCH_SIZE -data_type doc

# vectorize validation data
MODEL_PATH=$DATA_DIR/vectorization/dssm_loss_cls/dssm_loss_cls_epoch_1_step_50000.model
SRC_FILENAME=$DATA_DIR/test.jsonl
DEST_FILENAME=$DATA_DIR/test_dssm_loss_cls_vector.txt
BATCH_SIZE=20

python3 wsdm_digg/vectorization/predict.py -model_path $MODEL_PATH \
-src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME \
-batch_size $BATCH_SIZE -data_type 'query' -query_field 'description_text'

# vectorize paper

Expand All @@ -13,20 +36,20 @@ PLM_MODEL_NAME='scibert-scivocab-uncased'
#python3 wsdm_digg/vectorization/plm_vectorization.py -plm_model_name $PLM_MODEL_NAME \
# -src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME

export CUDA_VISIBLE_DEVICES=0
# export CUDA_VISIBLE_DEVICES=0

# vectorize validation data
SRC_FILENAME=$DATA_DIR/test.jsonl
DEST_FILENAME=$DATA_DIR/test_desc_vector.txt
#echo $SRC_FILENAME
#echo $DEST_FILENAME
python3 wsdm_digg/vectorization/plm_vectorization.py -plm_model_name $PLM_MODEL_NAME \
-src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME -mode 'query' \
-query_field 'description_text'

#SRC_FILENAME=$DATA_DIR/test.jsonl
DEST_FILENAME=$DATA_DIR/test_cite_vector.txt

python3 wsdm_digg/vectorization/plm_vectorization.py -plm_model_name $PLM_MODEL_NAME \
-src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME -mode 'query' \
-query_field 'cites_text'
# # vectorize validation data
# SRC_FILENAME=$DATA_DIR/test.jsonl
# DEST_FILENAME=$DATA_DIR/test_desc_vector.txt
# #echo $SRC_FILENAME
# #echo $DEST_FILENAME
# python3 wsdm_digg/vectorization/plm_vectorization.py -plm_model_name $PLM_MODEL_NAME \
# -src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME -mode 'query' \
# -query_field 'description_text'

# #SRC_FILENAME=$DATA_DIR/test.jsonl
# DEST_FILENAME=$DATA_DIR/test_cite_vector.txt

# python3 wsdm_digg/vectorization/plm_vectorization.py -plm_model_name $PLM_MODEL_NAME \
# -src_filename $SRC_FILENAME -dest_filename $DEST_FILENAME -mode 'query' \
# -query_field 'cites_text'
8 changes: 5 additions & 3 deletions scripts/train_rerank.sh
@@ -1,11 +1,11 @@
#!/bin/bash

EXP_NAME='only_TA_sample8_stop_scheduler'
EXP_NAME='only_TA_sample8_new'
DATA_DIR=${PWD}/data
#TRAINING_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_random.jsonl
#TRAINING_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_search_result.jsonl
#TRAINING_FILENAME=$DATA_DIR/only_TA_search_result.jsonl
TRAINING_FILENAME=$DATA_DIR/only_TA_sample8.jsonl
TRAINING_FILENAME=$DATA_DIR/only_TA_sample8.jsonl.bak
#TRAINING_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_search_result_false_top.jsonl
TEST_FILENAME=$DATA_DIR/test.jsonl

@@ -16,6 +16,7 @@ PLM_MODEL_NAME='scibert-scivocab-uncased'
RERANK_MODEL_NAME='plm'
#RERANK_MODEL_NAME='knrm'
#RERANK_MODEL_NAME='conv-knrm'
#RERANK_MODEL_NAME='mp'

DEST_DIR=$DATA_DIR/rerank/${EXP_NAME}_${RERANK_MODEL_NAME}/
#DEST_DIR=$DATA_DIR/rerank/${EXP_NAME}_${RERANK_MODEL_NAME}_context/
@@ -30,4 +31,5 @@ python3 wsdm_digg/reranking/trainer.py -exp_name $EXP_NAME \
-mean_list 0.9 0.7 0.5 0.3 0.1 -0.1 -0.3 -0.5 -0.7 -0.9 \
-stddev_list 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 \
-batch_size 4 -window_size_list 1 2 3 -gradient_accumulate_step 4 \
-scheduler_lr -scheduler_step 5000 -scheduler_gamma 0.5
-scheduler_lr -scheduler_step 10000 -scheduler_gamma 0.5 \
# -separate_learning_rate
12 changes: 8 additions & 4 deletions scripts/train_vectorization.sh
@@ -1,13 +1,17 @@
#!/bin/bash

EXP_NAME='baseline'
EXP_NAME='dssm_loss'
DATA_DIR=${PWD}/data
TRAIN_FILENAME=$DATA_DIR/only_TA_sample8.jsonl
TRAIN_FILENAME=$DATA_DIR/only_TA_sample10_aggregate.jsonl
TEST_FILENAME=$DATA_DIR/test_vectorization.jsonl
DEST_DIR=$DATA_DIR/vectorization/${EXP_NAME}
DEST_DIR=$DATA_DIR/vectorization/${EXP_NAME}/

PLM_MODEL_NAME='scibert-scivocab-uncased'

export CUDA_VISIBLE_DEVICES=0

python3 wsdm_digg/vectorization/trainer.py -train_filename $TRAIN_FILENAME \
-test_filename $TEST_FILENAME -exp_name $EXP_NAME -dest_base_dir $DEST_DIR \
-mode train -plm_model_name $PLM_MODEL_NAME -embed_mode USE
-mode train -plm_model_name $PLM_MODEL_NAME -embed_mode USE -batch_size 1 \
-scheduler_lr -gradient_accumulate_step 8 -query_field 'description_text' \
-learning_rate 1e-5
22 changes: 22 additions & 0 deletions tests/reranking/test_dataloader.py
@@ -1 +1,23 @@
# -*- coding: UTF-8 -*-
from munch import Munch
from wsdm_digg.constants import *
from wsdm_digg.reranking.dataloader import RerankDataLoader


def test_data_loader():
args = Munch({
'batch_size': 4,
'max_len': 512
})
mode_name = 'bert-base-uncased'
tokenizer = MODEL_DICT[mode_name]['tokenizer_class'].from_pretrained(mode_name)
loader = RerankDataLoader(DATA_DIR + 'baseline_rerank.jsonl',
tokenizer,
args, 'eval')
step = 0
for _ in range(10):
for batch in loader:
if step == 0:
print(batch)
print(step)
step += 1
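
A quick way to exercise the new loader test is to run it directly with pytest (assuming pytest is installed and a baseline_rerank.jsonl file exists under the repository's DATA_DIR; the -s flag only surfaces the test's print output):

    pytest tests/reranking/test_dataloader.py -s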
80 changes: 59 additions & 21 deletions wsdm_digg/benchmark/benchmarker.py
@@ -1,10 +1,14 @@
# -*- coding: UTF-8 -*-
import time
from multiprocessing import Pool
import argparse
# from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from pysenal import *
from wsdm_digg.search.search import KeywordSearch
from wsdm_digg.constants import DATA_DIR, RESULT_DIR, SUBMIT_DIR
from wsdm_digg.benchmark.evaluator import Evaluator
from wsdm_digg.utils import result_format
from wsdm_digg.elasticsearch.data import get_paper

searcher = KeywordSearch()

Expand All @@ -16,39 +20,59 @@ def __init__(self,
batch_size=100,
parallel_count=20,
top_n=20,
is_submit=False):
is_submit=False,
is_final_submit=False):
self.src_filename = src_filename
if is_submit:
self.src_filename = DATA_DIR + 'validation.jsonl'
self.dest_filename = SUBMIT_DIR + dest_filename
top_n = 3
else:
self.dest_filename = RESULT_DIR + dest_filename
self.dest_csv_filename = os.path.splitext(dest_filename)[0] + '.tsv'
if is_final_submit:
self.src_filename = DATA_DIR + 'test_release.jsonl'
self.dest_filename = SUBMIT_DIR + dest_filename
self.dest_csv_filename = os.path.splitext(self.dest_filename)[0] + '.csv'
self.src_count = int(os.popen('wc -l {}'.format(self.src_filename)).read().split()[0])
self.searched_id = self.get_searched_doc()
self.batch_size = batch_size
self.parallel_count = parallel_count
self.top_n = top_n
self.is_submit = is_submit
self.is_final_submit = is_final_submit

def batch_runner(self):
start = time.time()
pool = Pool(self.parallel_count)
for doc_chunk in get_chunk(read_jsonline_lazy(self.src_filename), self.batch_size):
ret = pool.map(self.single_query, doc_chunk)
ret = [r for r in ret if r['docs']]
append_jsonlines(self.dest_filename, ret)
pool = ThreadPool(self.parallel_count)

while True:
for doc_chunk in self.get_input_batch():
ret = pool.map(self.single_query, doc_chunk)
ret = [item for item in ret if item]
append_jsonlines(self.dest_filename, ret)
self.searched_id = self.get_searched_doc()
duration = time.time() - start
print('time consumed {}min {}sec'.format(duration // 60, duration % 60))
if len(self.searched_id) == self.src_count:
break

duration = time.time() - start
print('time consumed {}min {}sec'.format(duration // 60, duration % 60))
if self.is_submit:
self.result_format(self.dest_filename, self.dest_csv_filename)
if self.is_submit or self.is_final_submit:
result_format(self.dest_filename, self.dest_csv_filename)
if self.src_filename.endswith('test.jsonl'):
eval_ret = Evaluator(self.src_filename).evaluation_map(self.dest_filename, top_n=3)
print(eval_ret)

def single_query(self, doc):
ret = searcher.search(doc['description_text'], doc['cites_text'], self.top_n)
return {'description_id': doc['description_id'], **ret}
try:
# keywords = get_paper(doc['paper_id'])['keywords']
# print(keywords)
# ret = searcher.search(doc['description_text'], doc['cites_text'], self.top_n, keywords)
ret = searcher.search(doc['description_text'], doc['cites_text'], self.top_n)
return {'description_id': doc['description_id'], **ret}
except Exception as e:
# print(e)
return None

def get_input_batch(self):
batch = []
@@ -69,11 +93,25 @@ def get_searched_doc(self):
searched_doc_id.append(doc['description_id'])
return searched_doc_id

def result_format(self, src_filename, dest_filename):
for item in read_jsonline_lazy(src_filename):
desc_id = item['description_id']
paper_ids = item['docs'][:3]
if not paper_ids:
raise ValueError('result is empty')
line = desc_id + '\t' + '\t'.join(paper_ids)
append_line(dest_filename, line)

def main():
parser = argparse.ArgumentParser()
parser.add_argument('-src_filename', type=str, default=DATA_DIR + 'test.jsonl')
parser.add_argument('-dest_filename', type=str, required=True, )
parser.add_argument('-batch_size', type=int, default=100)
parser.add_argument('-parallel_count', type=int, default=20)
parser.add_argument('-top_n', type=int, default=20)
parser.add_argument('-is_submit', action='store_true')
args = parser.parse_args()

Benchmarker(dest_filename=args.dest_filename,
src_filename=args.src_filename,
batch_size=args.batch_size,
parallel_count=args.parallel_count,
top_n=args.top_n,
is_submit=args.is_submit
).batch_runner()


if __name__ == '__main__':
main()
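
For reference, a minimal invocation sketch for the reworked benchmarker, using only the argparse flags added in this commit (the output name is illustrative; -src_filename, -batch_size, -parallel_count and -top_n all have defaults in the parser above):

    python3 wsdm_digg/benchmark/benchmarker.py \
        -dest_filename only_TA.jsonl \
        -top_n 20

Note that -dest_filename is joined onto RESULT_DIR (or SUBMIT_DIR when -is_submit is passed), so a bare file name rather than a full path is expected here.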