first commit

HansiZeng · Oct 26, 2023 · 2f80814 · 2f80814
commit 2f80814
Show file tree

Hide file tree

Showing 77 changed files with 11,669 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,164 @@
+# customized
+experiments-full-t5seq-aq/
+wandb/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/READEME.md b/READEME.md
@@ -0,0 +1,6 @@
+# Package installation
+```bash
+pip install -r requirement.txt
+pip install torch==1.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+conda install -c conda-forge faiss-cpu
+```
diff --git a/full_16_1024_scripts/full_evaluate_t5seq_aq_encoder.sh b/full_16_1024_scripts/full_evaluate_t5seq_aq_encoder.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+task=all_aq_pipline
+data_root_dir=/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full
+collection_path=$data_root_dir/full_collection/
+q_collection_paths='["/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/TREC_DL_2019/queries_2019/","/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/TREC_DL_2020/queries_2020/","/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/dev_queries/"]'
+eval_qrel_path='["/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/dev_qrel.json","/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/TREC_DL_2019/qrel.json","/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/TREC_DL_2019/qrel_binary.json","/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/TREC_DL_2020/qrel.json","/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/TREC_DL_2020/qrel_binary.json"]'
+experiment_dir=experiments-full-16-1024-t5seq-aq
+
+if [ $task = all_aq_pipline ]; then 
+    echo "task: $task"
+
+    model_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    pretrained_path=$model_dir/checkpoint
+    index_dir=$model_dir/aq_index
+    mmap_dir=$model_dir/mmap
+    out_dir=$model_dir/aq_out
+
+    M=16
+    nbits=10
+    K=$((2 ** $nbits))
+    echo M: $M nbits: $nbits K: $K
+    echo $model_dir
+
+    #python -m torch.distributed.launch --nproc_per_node=8 -m t5_pretrainer.evaluate \
+    #--pretrained_path=$pretrained_path \
+    #--index_dir=$mmap_dir \
+    #--task=mmap \
+    #--encoder_type=t5seq_pretrain_encoder \
+    #--collection_path=$collection_path
+
+    #python -m t5_pretrainer.evaluate \
+    #--task=mmap_2 \
+    #--index_dir=$mmap_dir \
+    #--mmap_dir=$mmap_dir
+
+    python -m t5_pretrainer.evaluate \
+    --task=aq_index \
+    --num_subvectors_for_pq=$M \
+    --codebook_bits=$nbits \
+    --index_dir=$index_dir \
+    --mmap_dir=$mmap_dir
+
+    python -m t5_pretrainer.evaluate \
+    --task=aq_evaluate \
+    --pretrained_path=$pretrained_path \
+    --index_dir=$index_dir \
+    --out_dir=$out_dir \
+    --q_collection_paths=$q_collection_paths \
+    --eval_qrel_path=$eval_qrel_path  \
+    --mmap_dir=$mmap_dir
+
+    python t5_pretrainer/aq_preprocess/create_customized_smtid_file.py \
+    --model_dir=$model_dir \
+    --M=$M \
+    --bits=$nbits
+
+    python -m t5_pretrainer.aq_preprocess.change_customized_embed_layer \
+    --model_dir=$model_dir \
+    --K=$K
+elif [ $task = aq_to_flat_index_search_evaluate ]; then 
+    echo "task: $task"
+    data_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    docid_to_smtid_path=$data_dir/aq_smtid/docid_to_smtid.json
+
+    model_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    pretrained_path=$model_dir/no_share_checkpoint
+    index_dir=$model_dir/aq_flat_index
+    out_dir=$model_dir/aq_flat_out
+
+    python -m t5_pretrainer.evaluate --pretrained_path=$pretrained_path \
+        --collection_path=$collection_path/raw.tsv \
+        --docid_to_smtid_path=$docid_to_smtid_path \
+        --index_dir=$index_dir \
+        --out_dir=$out_dir  \
+        --task=$task \
+        --eval_qrel_path=$eval_qrel_path
+elif [ $task == "retrieve_train_queries" ]; then 
+    echo "run retrieve_train_queries task"
+
+    # the model_dir should be changed every time
+    model_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    index_dir=$model_dir/index
+    out_dir=$model_dir/out/
+    pretrained_path=$model_dir/checkpoint
+
+    python -m t5_pretrainer.evaluate \
+    --task=retrieve \
+    --pretrained_path=$pretrained_path \
+    --index_dir=$index_dir \
+    --out_dir=$out_dir  \
+    --q_collection_paths='["/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/all_train_queries/train_queries"]' \
+    --topk=100 \
+    --encoder_type=t5seq_pretrain_encoder
+elif [ $task = all_pipline ]; then 
+    echo "task: $task"
+
+    model_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    pretrained_path=$model_dir/checkpoint
+    index_dir=$model_dir/index
+    out_dir=$model_dir/out
+
+    python -m torch.distributed.launch --nproc_per_node=8 -m t5_pretrainer.evaluate \
+    --pretrained_path=$pretrained_path \
+    --index_dir=$index_dir \
+    --out_dir=$out_dir \
+    --task=index \
+    --encoder_type=t5seq_pretrain_encoder \
+    --collection_path=$collection_path
+
+    python -m t5_pretrainer.evaluate \
+    --task=index_2 \
+    --index_dir=$index_dir 
+
+    python -m t5_pretrainer.evaluate \
+    --task=retrieve \
+    --pretrained_path=$pretrained_path \
+    --index_dir=$index_dir \
+    --out_dir=$out_dir \
+    --encoder_type=t5seq_pretrain_encoder \
+    --q_collection_paths=$q_collection_paths \
+    --eval_qrel_path=$eval_qrel_path
+elif [ $task = "t5seq_aq_get_qid_to_smtid_rankdata" ]; then 
+    export CUDA_VISIBLE_DEVICES=0,1,2,3
+    echo "task: $task"
+    data_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    docid_to_smtid_path=$data_dir/aq_smtid/docid_to_smtid.json 
+
+    model_dir=/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5seq_aq_encoder_seq2seq_1
+    pretrained_path=$model_dir/checkpoint
+    train_query_dir="/home/ec2-user/quic-efs/user/hansizeng/work/data/msmarco-full/all_train_queries/train_queries/"
+
+    # need to remove later
+    max_new_token=8
+
+    out_dir=$model_dir/sub_smtid_"${max_new_token}"_out/
+    python -m torch.distributed.launch --nproc_per_node=4 -m t5_pretrainer.evaluate \
+        --pretrained_path=$pretrained_path \
+        --out_dir=$out_dir \
+        --task=$task \
+        --docid_to_smtid_path=$docid_to_smtid_path \
+        --topk=100 \
+        --batch_size=4 \
+        --train_query_dir=$train_query_dir \
+        --max_new_token=$max_new_token
+
+    python -m t5_pretrainer.evaluate \
+        --task="$task"_2 \
+        --out_dir=$out_dir 
+elif [ $task = "t5seq_aq_retrieve_docids_use_sub_smtid" ]; then 
+    export CUDA_VISIBLE_DEVICES=0,1,2,3
+    echo "task: $task"
+    data_dir="/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5_docid_gen_encoder_1"
+    docid_to_smtid_path=$data_dir/aq_smtid/docid_to_smtid.json 
+
+    # need to modify for a new experiment
+    max_new_token=8
+    model_dir=/home/ec2-user/quic-efs/user/hansizeng/work/t5_pretrainer/t5_pretrainer/$experiment_dir/t5seq_aq_encoder_seq2seq_1_lng_knp_self_mnt_8_dcy_2
+    pretrained_path=$model_dir/checkpoint
+    out_dir=$model_dir/out_docid_from_sub_"$max_new_token"_top1000/
+
+    python -m torch.distributed.launch --nproc_per_node=4 -m t5_pretrainer.evaluate \
+        --pretrained_path=$pretrained_path \
+        --out_dir=$out_dir \
+        --task=t5seq_aq_retrieve_docids \
+        --docid_to_smtid_path=$docid_to_smtid_path \
+        --q_collection_paths=$q_collection_paths \
+        --batch_size=1 \
+        --max_new_token_for_docid=$max_new_token \
+        --topk=1000
+
+    python -m t5_pretrainer.evaluate \
+        --task=t5seq_aq_retrieve_docids_2 \
+        --out_dir=$out_dir \
+        --q_collection_paths=$q_collection_paths \
+        --eval_qrel_path=$eval_qrel_path
+else 
+echo "Error: Unknown task."
+exit 1
+fi