Merge pull request #8 from Muennighoff/sgptcepy
Add CE script
Muennighoff authored Aug 11, 2022
2 parents 127ae25 + 81f2387 commit 6e122e8
Showing 5 changed files with 556 additions and 4 deletions.
2 changes: 2 additions & 0 deletions biencoder/nli_msmarco/README.md
@@ -352,3 +352,5 @@ On BEIR going from a batch size of 16 to 256 yielded a 1% average performance in


For simple usage of wandb, we prepend `WANDB_BASE_URL=https://api.wandb.ai WANDB_API_KEY=YOUR_API_KEY WANDB_ENTITY=YOUR_ENTITY_NAME WANDB_PROJECT=YOUR_PROJECT` to all commands.

When no internet connection is available, also prepend `WANDB_MODE="dryrun"` and upload the results afterwards using `WANDB_BASE_URL=https://api.wandb.ai WANDB_API_KEY=YOUR_API_KEY WANDB_ENTITY=YOUR_ENTITY_NAME WANDB_PROJECT=YOUR_PROJECT wandb sync --sync-all`.
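
As a concrete sketch of this offline workflow (`YOUR_TRAINING_COMMAND` below is a placeholder for any of the training commands in this README):

```bash
# Run offline: wandb caches the run locally instead of uploading it.
WANDB_MODE="dryrun" YOUR_TRAINING_COMMAND

# Later, with internet access, upload all locally cached runs.
WANDB_BASE_URL=https://api.wandb.ai WANDB_API_KEY=YOUR_API_KEY \
WANDB_ENTITY=YOUR_ENTITY_NAME WANDB_PROJECT=YOUR_PROJECT wandb sync --sync-all
```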
27 changes: 27 additions & 0 deletions biencoder/nli_msmarco/scripts/train_nobitfit.slurm
@@ -0,0 +1,27 @@
#!/bin/bash
#SBATCH --job-name=muennighoffs
#SBATCH --partition=gpu_p5
#SBATCH --constraint=a100
#SBATCH --reservation=hug
#SBATCH --qos=qos_gpu-gc # up to 100h
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=64 # number of cores per task
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:8 # number of gpus
#SBATCH --time 100:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@a100

set -x -e

source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
conda activate muennighoffs
echo "START TIME: $(date)"

cd /gpfsscratch/rech/six/commun/experiments/muennighoff/sgpt/biencoder/nli_msmarco/sentence-transformers


WANDB_MODE="dryrun" WANDB_BASE_URL=https://api.wandb.ai WANDB_API_KEY=YOUR_KEY WANDB_ENTITY=muennighoff WANDB_PROJECT="sgpt" CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file /gpfsscratch/rech/six/commun/experiments/muennighoff/sgpt/accelerate_config_fp32 examples/training/nli/training_nli_v2.py --model_name EleutherAI/gpt-neo-2.7B --train_batch_size 128 --lr 32e-5 --pooling weightedmean --wandb --wandbwatchlog gradients --gradcache --chunksize 16

echo "DONE"
10 changes: 6 additions & 4 deletions crossencoder/beir/README.md
@@ -4,11 +4,13 @@

Notebook overview:

-- `crossencoder_beir_bm25`: Creating BM25 results on BEIR
-- `crossencoder_beir_sgpt`: Creating SGPT reranking results based on BM25 results (Note: If you do not want to rerun BM25, you can download the BM25 results from the datasets provided; see the Downloads section for more information.)
-- `crossencoder_bioasq_bm25`: Parsing of the BioASQ dataset & running it with BM25 - This dataset is 21GB in size & requires some specific processing, hence the separate notebook. Run it on a large RAM instance to avoid BM25 running out of memory.
-- `crossencoder_openai`: Scoring the OpenAI semantic search endpoint on BEIR (This is not the embedding endpoint, but (most likely) a Cross Encoder based endpoint)
+- `crossencoder_beir_bm25.ipynb`: Creating BM25 results on BEIR
+- `crossencoder_beir_sgpt.ipynb`: Creating SGPT reranking results based on BM25 results (Note: If you do not want to rerun BM25, you can download the BM25 results from the datasets provided; see the Downloads section for more information.)
+- `crossencoder_bioasq_bm25.ipynb`: Parsing of the BioASQ dataset & running it with BM25 - This dataset is 21GB in size & requires some specific processing, hence the separate notebook. Run it on a large RAM instance to avoid BM25 running out of memory.
+- `crossencoder_openai.ipynb`: Scoring the OpenAI semantic search endpoint on BEIR (This is not the embedding endpoint, but (most likely) a Cross Encoder based endpoint)
 - `openai_search_endpoint_functionality.py`: OpenAI Search Endpoint mechanism (released to the public in 06/2022, after the SGPT paper release)
+- `sgptce.py`: Provides a non-notebook variant of `crossencoder_beir_sgpt.ipynb` (see the usage sketch below)
+- `scripts`: SLURM scripts for `sgptce.py`
 - `../../other/sgpt_utils`: Various utils for computing re-ranking scores & graphs for the SGPT paper (the code is partly duplicated in `crossencoder_beir_sgpt`)
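
A minimal usage sketch for `sgptce.py`, with flags mirroring `scripts/run_sgptce.slurm`; the small public checkpoint here is a placeholder assumption for local testing, whereas the SLURM script points `--modelpath` at a local BLOOM-6B3 checkpoint:

```bash
# Illustrative invocation; batch size and model are placeholders, not the evaluated setup.
python sgptce.py \
    --batchsize 32 \
    --dataset scifact \
    --modelpath bigscience/bloom-560m
```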

### Downloads
46 changes: 46 additions & 0 deletions crossencoder/beir/scripts/run_sgptce.slurm
@@ -0,0 +1,46 @@
#!/bin/bash
#SBATCH --job-name=run-array-a100 # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=8 # number of cores per task
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time 100:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@a100
#SBATCH --reservation=hug
#SBATCH --constraint=a100
#SBATCH --partition=gpu_p5
#SBATCH --qos=qos_gpu-gc # up to 100h

set -x -e

source $six_ALL_CCFRWORK/start-prod
conda activate muennighoffmtb

echo "START TIME: $(date)"

cd /gpfswork/rech/six/commun/code/tr13f-6B3-ml-t0/sgptce/

#"fever"
#"climate-fever"
#"nq"
#"hotpotqa"
DATASETS=(
"trec-covid"
"webis-touche2020"
"nfcorpus"
"scifact"
"fiqa"
"dbpedia-entity"
"quora"
"arguana"
"scidocs"
)

dataset=${DATASETS[$SLURM_ARRAY_TASK_ID]}

python sgptce.py \
--batchsize 128 \
--dataset $dataset \
--modelpath /gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3/bloom-6b3
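
Because the script picks its dataset via `$SLURM_ARRAY_TASK_ID`, it is intended to be submitted as a job array; with the nine datasets listed above, the valid indices are 0-8. For example:

```bash
# One array task per BEIR dataset in the DATASETS list (indices 0-8).
sbatch --array=0-8 crossencoder/beir/scripts/run_sgptce.slurm

# Check the status of the array tasks.
squeue -u $USER
```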