
Commit

Merge pull request #53 from notoookay/update-env-config
typos and missing dependency
AkariAsai authored Mar 19, 2024
2 parents 12afe0b + 1052b89 commit 141dd48
Showing 5 changed files with 15 additions and 4 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -111,7 +111,7 @@ If the script does not work, you can download the data from [google drive](https
Then, you can run the script under `retrieval_lm`. We tested the script on one RTX 6000 with 24GB and 100GB RAM (but it should be runnable with much less RAM).

```py
from passage_retriever import Retriever
from passage_retrieval import Retriever
retriever = Retriever({})
retriever.setup_retriever_demo("facebook/contriever-msmarco", "enwiki_2020_intro_only/enwiki_2020_dec_intro_only.jsonl", "enwiki_2020_intro_only/enwiki_dec_2020_contriever_intro/*", n_docs=5, save_or_load_index=False)
retrieved_documents = retriever.search_document_demo(query_3, 5)
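
For reference, here is the corrected README snippet assembled in one piece. Everything below is taken from the diff above except the example query string and the final print, which are hypothetical placeholders added for illustration.

```py
from passage_retrieval import Retriever  # corrected module name (was passage_retriever)

retriever = Retriever({})
retriever.setup_retriever_demo(
    "facebook/contriever-msmarco",
    "enwiki_2020_intro_only/enwiki_2020_dec_intro_only.jsonl",
    "enwiki_2020_intro_only/enwiki_dec_2020_contriever_intro/*",
    n_docs=5,
    save_or_load_index=False,
)

query_3 = "Who wrote the novel Moby-Dick?"  # hypothetical example query
retrieved_documents = retriever.search_document_demo(query_3, 5)
print(retrieved_documents)  # inspect the top-5 retrieved passages
```
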
environment.yml: 2 changes (1 addition, 1 deletion)
@@ -52,7 +52,7 @@ dependencies:
- exceptiongroup==1.2.0
- fastapi==0.105.0
- filelock==3.13.1
- flash-attn==2.3.6
# - flash-attn==2.3.6
- frozenlist==1.4.1
- fsspec==2023.10.0
- google-auth==2.25.2
retrieval_lm/script_finetune_13b.sh: 2 changes (1 addition, 1 deletion)
@@ -18,7 +18,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
--use_flash_attn \
--tokenizer_name meta-llama/Llama-2-13b-hf \
--use_slow_tokenizer \
--train_file full_output_1005.jsonl \
--train_file train.jsonl \
--max_seq_length 1536 \
--preprocessing_num_workers 16 \
--per_device_train_batch_size $BATCH_SIZE_PER_GPU \
retrieval_lm/script_finetune_7b.sh: 2 changes (1 addition, 1 deletion)
@@ -18,7 +18,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
--use_flash_attn \
--tokenizer_name meta-llama/Llama-2-7b-hf \
--use_slow_tokenizer \
--train_file full_output_1005.jsonl \
--train_file train.jsonl \
--max_seq_length 2048 \
--preprocessing_num_workers 16 \
--per_device_train_batch_size $BATCH_SIZE_PER_GPU \
setup.sh: 11 changes (11 additions, 0 deletions)
@@ -0,0 +1,11 @@
#!/bin/bash

conda env create -f environment.yml

conda activate selfrag

# Install flash-attn package
pip3 install flash-attn==2.3.6

# Install faiss-gpu as it is not included in the `environment.yml`
conda install -c conda-forge faiss-gpu
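
Note that setup.sh relies on `conda activate`, which only takes effect in the calling shell if the script is sourced rather than executed in a subshell. A minimal usage sketch, assuming conda has already been initialized for bash (e.g. via `conda init bash`):

```sh
# Sourcing the script lets the `conda activate selfrag` step persist in the
# current shell; the flash-attn and faiss-gpu install steps then run inside
# the newly created environment.
source setup.sh
```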
