Skip to content

Commit

Permalink
update scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
supercoderhawk committed Jan 17, 2020
1 parent d4b182b commit c3dc80c
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 23 deletions.
4 changes: 3 additions & 1 deletion scripts/prepare_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ python3 wsdm_digg/data_process/raw_data_formatter.py
# Data-preparation steps from the scripts/prepare_data.sh hunk.
# The script paths are relative, so this is expected to be launched from the
# directory that contains wsdm_digg/.
# Split step: runs data_split.py with no arguments; its inputs and outputs
# are not visible from here.
python3 wsdm_digg/data_process/data_split.py

# build elasticsearch indexing
# NOTE(review): the indexer command is listed twice below. The hunk counts
# (4 old lines, 6 new lines, 1 deletion) suggest these are the removed and
# re-added sides of a single changed line, not two intended runs -- confirm
# against the real script before executing this text as-is.
python3 wsdm_digg/elasticsearch/indexer.py
python3 wsdm_digg/elasticsearch/indexer.py

# Completion message. No exit-status check is visible in this hunk, so unless
# the unseen top of the script enables `set -e`, this prints even when an
# earlier step failed.
echo 'index building done!'
24 changes: 12 additions & 12 deletions scripts/prepare_rerank.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@

# build data
# NOTE(review): this span is a rendered diff hunk without +/- markers, so
# superseded and current lines appear together. If it were executed top to
# bottom, the last assignment of each variable would win.
# All data paths hang off the directory the script is invoked from (PWD).
DATA_DIR=${PWD}/data
# SEARCH_FILENAME=$DATA_DIR/result/only_TA_train.jsonl
SEARCH_FILENAME=$DATA_DIR/result/only_TA.jsonl
# GOLDEN_FILENAME=$DATA_DIR/train.jsonl
GOLDEN_FILENAME=$DATA_DIR/test.jsonl
#DEST_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_search_result_offset_10.jsonl
DEST_FILENAME=$DATA_DIR/test_vectorization_offset20.jsonl
# DEST_FILENAME=$DATA_DIR/only_TA_sample10_aggregate.jsonl
# The three assignments below overwrite the values set above: search results
# and golden file both point at the train split, and the first builder run
# writes to only_TA_search_result.jsonl.
SEARCH_FILENAME=$DATA_DIR/result/only_TA_train.jsonl
GOLDEN_FILENAME=$DATA_DIR/train.jsonl
DEST_FILENAME=$DATA_DIR/only_TA_search_result.jsonl

# execute BM25 search to generate candidates for building training data
# The source file is spelled out as $DATA_DIR/train.jsonl instead of reusing
# $GOLDEN_FILENAME; with the assignments above both name the same file.
# NOTE(review): expansions are unquoted throughout, so a working directory
# containing spaces would split these arguments.
python3 wsdm_digg/benchmark/benchmarker.py -src_filename $DATA_DIR/train.jsonl \
-dest_filename $SEARCH_FILENAME

# build training data for reranking
# First builder run: 'search_result_offset' strategy with offset 20 and
# sample_count 1, written to the DEST_FILENAME currently in effect.
# NOTE(review): this uses `python` while the search step uses `python3`;
# confirm both resolve to the same interpreter.
python wsdm_digg/data_process/rerank_data_builder.py -search_filename $SEARCH_FILENAME \
-golden_filename $GOLDEN_FILENAME -dest_filename $DEST_FILENAME \
-select_strategy 'search_result_offset' -offset 20 -sample_count 1

# Re-point the destination so the second builder run below does not overwrite
# the output of the first.
DEST_FILENAME=$DATA_DIR/only_TA_sample8.jsonl

# DEST_FILENAME=$DATA_DIR/only_TA_sample8.jsonl.bak

# python wsdm_digg/data_process/rerank_data_builder.py -search_filename $SEARCH_FILENAME \
# -golden_filename $GOLDEN_FILENAME -dest_filename $DEST_FILENAME \
# -select_strategy 'search_result_offset' -offset 2 -sample_count 8
# Second builder run: same search/golden inputs, offset 2 and sample_count 8.
# The commented-out copy above is identical apart from the leading '#';
# presumably it is the pre-change side of this diff.
python wsdm_digg/data_process/rerank_data_builder.py -search_filename $SEARCH_FILENAME \
-golden_filename $GOLDEN_FILENAME -dest_filename $DEST_FILENAME \
-select_strategy 'search_result_offset' -offset 2 -sample_count 8
17 changes: 10 additions & 7 deletions scripts/run_end2end.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
#!/bin/bash
# End-to-end run: a BM25 search step (benchmarker.py) followed by a rerank
# step (reranking/predict.py) that consumes the search output.
# NOTE(review): this span is a rendered diff without +/- markers, so old and
# new lines are interleaved. Variables assigned twice (DATA_DIR,
# ES_RESULT_FILE, FINAL_RESULT_FILENAME, MODEL_PATH, TOPK) would take their
# last value if this text were executed as-is.

DATA_DIR=${PWD}/data/
ES_RESULT_FILE=$DATA_DIR/validation_es_result.jsonl
FINAL_RESULT_FILENAME=$DATA_DIR/validation_final_result.jsonl
MODEL_PATH=$DATA_DIR/models/rerank_model.model
TOPK=20
# Second set of assignments: DATA_DIR drops the trailing slash, inputs and
# outputs switch from validation_* to test_*, the model path loses the
# models/ subdirectory, and TOPK goes from 20 to 50.
DATA_DIR=${PWD}/data
GOLDEN_FILENAME=$DATA_DIR/test_release.jsonl
ES_RESULT_FILE=$DATA_DIR/test_es_result.jsonl
FINAL_RESULT_FILENAME=$DATA_DIR/test_final_result.jsonl
MODEL_PATH=$DATA_DIR/rerank_model.model
TOPK=50

# Exported so the python child processes below inherit it; the value 1 is
# hard-coded (presumably selects the second GPU -- confirm for the target host).
export CUDA_VISIBLE_DEVICES=1

# run elasticsearch (BM25)
# NOTE(review): the next two lines both end with a backslash, so as rendered
# they fuse into a single command that invokes benchmarker.py with the second
# `python3 ...` line as extra arguments. They look like the old and new sides
# of one changed line; only one should remain in the real script.
# NOTE(review): the test_release.jsonl path is spelled out here even though
# GOLDEN_FILENAME holds the same value.
python3 wsdm_digg/benchmark/benchmarker.py -src_filename $DATA_DIR/validation.jsonl \
python3 wsdm_digg/benchmark/benchmarker.py -src_filename $DATA_DIR/test_release.jsonl \
-dest_filename $ES_RESULT_FILE

# run rerank by bert
# NOTE(review): -golden_filename is passed twice below ($VALID_FILE, then
# $GOLDEN_FILENAME); again this looks like old/new diff sides. VALID_FILE is
# never assigned in this script, so unquoted it expands to nothing.
# NOTE(review): RESULT_DIR is never assigned in this script either, and
# FINAL_RESULT_FILENAME already starts with $DATA_DIR, so
# $RESULT_DIR/$FINAL_RESULT_FILENAME expands to "/" followed by the full
# data path (a leading double slash) unless RESULT_DIR comes from the
# environment. Likely intended: -dest_filename $FINAL_RESULT_FILENAME.
python3 wsdm_digg/reranking/predict.py -eval_search_filename $ES_RESULT_FILE \
-golden_filename $VALID_FILE \
-golden_filename $GOLDEN_FILENAME \
-dest_filename $RESULT_DIR/$FINAL_RESULT_FILENAME \
-model_path $MODEL_PATH \
-eval_batch_size 10 -topk $TOPK
7 changes: 4 additions & 3 deletions scripts/train_rerank.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/bin/bash
# Configuration for a rerank training run: experiment name, data directory,
# training file and test file. How these variables are consumed is outside
# this hunk.
# NOTE(review): this span is a rendered diff without +/- markers. EXP_NAME and
# TRAINING_FILENAME are each assigned twice; the later assignment wins if the
# text is executed as-is.

EXP_NAME='only_TA_sample8_new'
EXP_NAME='only_TA'
# Data paths are rooted at the directory the script is invoked from (PWD).
DATA_DIR=${PWD}/data
# Candidate training files; the commented-out lines are alternatives that are
# currently disabled.
#TRAINING_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_random.jsonl
#TRAINING_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_search_result.jsonl
#TRAINING_FILENAME=$DATA_DIR/only_TA_search_result.jsonl
TRAINING_FILENAME=$DATA_DIR/only_TA_sample8.jsonl.bak
TRAINING_FILENAME=$DATA_DIR/only_TA_search_result.jsonl
#TRAINING_FILENAME=$DATA_DIR/only_TA.jsonl
#TRAINING_FILENAME=$DATA_DIR/cite_textrank_top10_rerank_search_result_false_top.jsonl
TEST_FILENAME=$DATA_DIR/test.jsonl

Expand All @@ -17,6 +17,7 @@ RERANK_MODEL_NAME='plm'
# Alternative RERANK_MODEL_NAME values, all disabled here; the active
# assignment sits just above this span in the script.
#RERANK_MODEL_NAME='knrm'
#RERANK_MODEL_NAME='conv-knrm'
#RERANK_MODEL_NAME='mp'
#RERANK_MODEL_NAME='pairwise'

# Per-experiment directory under $DATA_DIR/rerank, named
# <EXP_NAME>_<RERANK_MODEL_NAME>/ (note the trailing slash). EXP_NAME and
# DATA_DIR are assigned earlier in the script; what gets written into this
# directory is not visible from here.
DEST_DIR=$DATA_DIR/rerank/${EXP_NAME}_${RERANK_MODEL_NAME}/
#DEST_DIR=$DATA_DIR/rerank/${EXP_NAME}_${RERANK_MODEL_NAME}_context/
Expand Down

0 comments on commit c3dc80c

Please sign in to comment.