From 54471b8208f5bcc8e773b6dc240e6ff3aa3f7e6b Mon Sep 17 00:00:00 2001
From: ronakice
Date: Sat, 6 Feb 2021 17:07:29 +0530
Subject: [PATCH 1/2] cleanup code, primarily DPR

---
 pygaggle/data/retrieval.py              |  2 +-
 pygaggle/model/decode.py                |  1 -
 pygaggle/model/evaluate.py              |  9 ++++----
 pygaggle/model/writer.py                |  1 +
 pygaggle/reader/base.py                 | 26 ++++++++++++-----------
 pygaggle/run/evaluate_passage_reader.py | 11 +++++-----
 scripts/train_d2q.py                    | 28 +++++++++++++------------
 7 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/pygaggle/data/retrieval.py b/pygaggle/data/retrieval.py
index b15c4ebc..f3c7b0d2 100644
--- a/pygaggle/data/retrieval.py
+++ b/pygaggle/data/retrieval.py
@@ -8,4 +8,4 @@
 class RetrievalExample:
     query: Query
     texts: List[Text]
-    groundTruthAnswers: List[List[str]]
+    ground_truth_answers: List[List[str]]
diff --git a/pygaggle/model/decode.py b/pygaggle/model/decode.py
index f7bea22e..ce34878d 100644
--- a/pygaggle/model/decode.py
+++ b/pygaggle/model/decode.py
@@ -31,7 +31,6 @@ def greedy_decode(model: PreTrainedModel,
         decode_ids = torch.cat([decode_ids,
                                 next_token_logits.max(1)[1].unsqueeze(-1)],
                                dim=-1)
-        past = outputs[1]
     if return_last_logits:
         return decode_ids, next_token_logits
     return decode_ids
diff --git a/pygaggle/model/evaluate.py b/pygaggle/model/evaluate.py
index 4ce2cb14..99a8d6b8 100644
--- a/pygaggle/model/evaluate.py
+++ b/pygaggle/model/evaluate.py
@@ -244,6 +244,7 @@ def evaluate_by_segments(self,
                 metric.accumulate(doc_scores, example)
         return metrics
 
+
 class ReaderEvaluator:
     """Class for evaluating a reader.
     Takes in a list of examples (query, texts, ground truth answers),
@@ -268,15 +269,15 @@ def evaluate(
         for example in tqdm(examples):
             answers = self.reader.predict(example.query, example.texts)
 
-            bestAnswer = answers[0].text
-            groundTruthAnswers = example.groundTruthAnswers
-            em_hit = max([ReaderEvaluator.exact_match_score(bestAnswer, ga) for ga in groundTruthAnswers])
+            best_answer = answers[0].text
+            ground_truth_answers = example.ground_truth_answers
+            em_hit = max([ReaderEvaluator.exact_match_score(best_answer, ga) for ga in ground_truth_answers])
             ems.append(em_hit)
 
             if dpr_predictions is not None:
                 dpr_predictions.append({
                     'question': example.query.text,
-                    'prediction': bestAnswer,
+                    'prediction': best_answer,
                 })
 
         return ems
diff --git a/pygaggle/model/writer.py b/pygaggle/model/writer.py
index 3ea41b18..dfb6a6c8 100644
--- a/pygaggle/model/writer.py
+++ b/pygaggle/model/writer.py
@@ -31,6 +31,7 @@ def write(self, scores: List[float], example: RelevanceExample):
         for ct, (doc, score) in enumerate(doc_scores):
             self.write_line(f"{example.query.id}\t{doc.metadata['docid']}\t{ct+1}")
 
+
 class TrecWriter(Writer):
     def write(self, scores: List[float], example: RelevanceExample):
         doc_scores = sorted(list(zip(example.documents, scores)),
diff --git a/pygaggle/reader/base.py b/pygaggle/reader/base.py
index a91d6909..f721d905 100644
--- a/pygaggle/reader/base.py
+++ b/pygaggle/reader/base.py
@@ -7,13 +7,15 @@
 class Answer:
     """
    Class representing an answer.
-    A answer contains the answer text itself and potentially other metadata.
+    An answer contains the answer text itself and potentially other metadata.
    Parameters
    ----------
    text : str
        The answer text.
    metadata : Mapping[str, Any]
        Additional metadata and other annotations.
+    language: str
+        The language of the answer text.
    score : Optional[float]
        The score of the answer.
    ctx_score : Optional[float]
@@ -55,16 +57,16 @@ def predict(
         texts: List[Text],
     ) -> List[Answer]:
         """
-        Find answers from a list of texts with respect to a query.
-        Parameters
-        ----------
-        query : Query
-            The query.
-        texts : List[Text]
-            The list of texts.
-        Returns
-        -------
-        List[Answer]
-            Predicted list of answers.
+        Find answers from a list of texts with respect to a query.
+        Parameters
+        ----------
+        query : Query
+            The query.
+        texts : List[Text]
+            The list of texts.
+        Returns
+        -------
+        List[Answer]
+            Predicted list of answers.
         """
         pass
diff --git a/pygaggle/run/evaluate_passage_reader.py b/pygaggle/run/evaluate_passage_reader.py
index 02e27159..780b768d 100644
--- a/pygaggle/run/evaluate_passage_reader.py
+++ b/pygaggle/run/evaluate_passage_reader.py
@@ -6,8 +6,6 @@
 
 import numpy as np
 from pydantic import BaseModel
-from transformers import (DPRReader,
-                          DPRReaderTokenizer)
 
 from .args import ArgumentParserBuilder, opt
 from pygaggle.reader.base import Reader
@@ -31,6 +29,7 @@ class PassageReadingEvaluationOptions(BaseModel):
     num_spans_per_passage: int
     device: str
 
+
 def construct_dpr(options: PassageReadingEvaluationOptions) -> Reader:
     model = DensePassageRetrieverReader.get_model(options.model_name, options.device)
     tokenizer = DensePassageRetrieverReader.get_tokenizer(options.tokenizer_name)
@@ -89,7 +88,7 @@ def main():
         opt('--output-file',
             type=Path,
             default=None,
-            help='File to output predictions for each example; if no output file specified, this output will be discarded'),
+            help='File to output predictions for each example; if not specified, this output will be discarded'),
         opt('--device',
             type=str,
             default='cuda:0',
@@ -133,8 +132,10 @@ def main():
         examples.append(
             RetrievalExample(
                 query=Query(text=item["question"]),
-                texts=list(map(lambda context: Text(text=context["text"].split('\n', 1)[1], title=context["text"].split('\n', 1)[0][1:-1]), item["contexts"]))[:options.use_top_k_passages],
-                groundTruthAnswers=item["answers"],
+                texts=list(map(lambda context: Text(text=context["text"].split('\n', 1)[1],
+                                                    title=context["text"].split('\n', 1)[0][1:-1]),
+                               item["contexts"]))[:options.use_top_k_passages],
+                ground_truth_answers=item["answers"],
             )
         )
 
diff --git a/scripts/train_d2q.py b/scripts/train_d2q.py
index 8e46d55e..afa71579 100644
--- a/scripts/train_d2q.py
+++ b/scripts/train_d2q.py
@@ -9,25 +9,27 @@
 from torch.utils.data import Dataset
 import argparse
 
+
 class TrainerDataset(Dataset):
     def __init__(self, path):
-        df = pd.read_csv(path, sep = "\t")
+        df = pd.read_csv(path, sep="\t")
         df = df.dropna()
         self.dataset = df
         self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
-    
+
     def __len__(self):
         return len(self.dataset)
-    
+
     def __getitem__(self, idx):
         source = self.dataset.iloc[idx, 0]
         target = self.dataset.iloc[idx, 1]
         input_ids = self.tokenizer.encode(args.tag + ': ' + source, return_tensors='pt',
-                                          padding='max_length',truncation='longest_first', max_length=512)[0]
+                                          padding='max_length', truncation='longest_first', max_length=512)[0]
         label = self.tokenizer.encode(target, return_tensors='pt', padding='max_length',
                                       truncation='longest_first', max_length=64)[0]
-        return {'input_ids':input_ids, 'labels':label}
-
+        return {'input_ids': input_ids, 'labels': label}
+
+
 parser = argparse.ArgumentParser(description='Train docTquery on more datasets')
 parser.add_argument('--pretrained_model_path', default='t5-base', help='pretrained model path')
 parser.add_argument('--tag', default='msmarco', help='tag for training data', type=str)
@@ -38,24 +40,24 @@ def __getitem__(self, idx):
 parser.add_argument('--weight_decay', default=5e-5, type=float)
 parser.add_argument('--lr', default=3e-4, type=float)
 parser.add_argument('--gra_acc_steps', default=8, type=int)
-args = parser.parse_args() 
+args = parser.parse_args()
 
 model = T5ForConditionalGeneration.from_pretrained(args.pretrained_model_path)
 train_dataset = TrainerDataset(args.train_data_path)
 
 training_args = TrainingArguments(
-    output_dir=args.output_path, 
-    num_train_epochs=args.epoch, 
-    per_device_train_batch_size=args.batch_size, 
+    output_dir=args.output_path,
+    num_train_epochs=args.epoch,
+    per_device_train_batch_size=args.batch_size,
     weight_decay=args.weight_decay,
     learning_rate=args.lr,
     gradient_accumulation_steps=args.gra_acc_steps,
-    logging_dir='./logs', 
+    logging_dir='./logs',
 )
 
 trainer = Trainer(
-    model=model, 
-    args=training_args, 
+    model=model,
+    args=training_args,
     train_dataset=train_dataset
 )
 

From 87ec9f6d991d208ec58b2a0951ceab364f32b74b Mon Sep 17 00:00:00 2001
From: ronakice
Date: Sat, 6 Feb 2021 17:18:28 +0530
Subject: [PATCH 2/2] flake8 compliant

---
 pygaggle/data/msmarco.py                 | 6 +++---
 pygaggle/rerank/transformer.py           | 9 ++++++---
 pygaggle/run/evaluate_document_ranker.py | 6 ++----
 pygaggle/run/evaluate_passage_reader.py  | 2 ++
 pygaggle/settings.py                     | 2 ++
 5 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/pygaggle/data/msmarco.py b/pygaggle/data/msmarco.py
index 34aa1dc8..58ecffd1 100644
--- a/pygaggle/data/msmarco.py
+++ b/pygaggle/data/msmarco.py
@@ -16,8 +16,8 @@
 
 __all__ = ['MsMarcoExample', 'MsMarcoDataset']
 
-# MsMarcoExample represents a query along with its ranked and re-ranked
-# candidates.
+
+# MsMarcoExample represents a query along with its ranked and re-ranked candidates.
 class MsMarcoExample(BaseModel):
     qid: str
     text: str
@@ -41,7 +41,7 @@ def load_qrels(cls, path: str) -> DefaultDict[str, Set[str]]:
         return qrels
 
     # Load a run from the provided path. The run file contains mappings from
-    # a query id and a doc title to a rank. load_run returns a dictionary 
+    # a query id and a doc title to a rank. load_run returns a dictionary
     # mapping query ids to lists of doc titles sorted by ascending rank.
     @classmethod
     def load_run(cls, path: str):
diff --git a/pygaggle/rerank/transformer.py b/pygaggle/rerank/transformer.py
index 05ecd56a..811bdf67 100644
--- a/pygaggle/rerank/transformer.py
+++ b/pygaggle/rerank/transformer.py
@@ -42,7 +42,8 @@ def get_model(pretrained_model_name_or_path: str = 'castorini/monot5-base-msmarc
                   *args, device: str = None, **kwargs) -> T5ForConditionalGeneration:
         device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
         device = torch.device(device)
-        return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path, *args, **kwargs).to(device).eval()
+        return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path,
+                                                          *args, **kwargs).to(device).eval()
 
     @staticmethod
     def get_tokenizer(pretrained_model_name_or_path: str = 't5-base',
@@ -86,7 +87,8 @@ def get_model(pretrained_model_name_or_path: str = 'castorini/duot5-base-msmarco
                   *args, device: str = None, **kwargs) -> T5ForConditionalGeneration:
         device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
         device = torch.device(device)
-        return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path, *args, **kwargs).to(device).eval()
+        return T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path,
+                                                          *args, **kwargs).to(device).eval()
 
     @staticmethod
     def get_tokenizer(pretrained_model_name_or_path: str = 't5-base',
@@ -182,7 +184,8 @@ def get_model(pretrained_model_name_or_path: str = 'castorini/monobert-large-msm
                   *args, device: str = None, **kwargs) -> AutoModelForSequenceClassification:
         device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
         device = torch.device(device)
-        return AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *args, **kwargs).to(device).eval()
+        return AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path,
+                                                                  *args, **kwargs).to(device).eval()
 
     @staticmethod
     def get_tokenizer(pretrained_model_name_or_path: str = 'bert-large-uncased',
diff --git a/pygaggle/run/evaluate_document_ranker.py b/pygaggle/run/evaluate_document_ranker.py
index 67296d4c..14be5fc2 100644
--- a/pygaggle/run/evaluate_document_ranker.py
+++ b/pygaggle/run/evaluate_document_ranker.py
@@ -4,9 +4,7 @@
 
 from pydantic import BaseModel, validator
 from transformers import (AutoModel,
-                          AutoTokenizer,
-                          AutoModelForSequenceClassification,
-                          T5ForConditionalGeneration)
+                          AutoTokenizer)
 import torch
 
 from .args import ArgumentParserBuilder, opt
@@ -20,7 +18,6 @@
 from pygaggle.rerank.random import RandomReranker
 from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider
 from pygaggle.model import (SimpleBatchTokenizer,
-                            T5BatchTokenizer,
                             RerankerEvaluator,
                             metric_names,
                             MsMarcoWriter)
@@ -169,5 +166,6 @@ def main():
                                  options.aggregate_method):
         logging.info(f'{metric.name:<{width}}{metric.value:.5}')
 
+
 if __name__ == '__main__':
     main()
diff --git a/pygaggle/run/evaluate_passage_reader.py b/pygaggle/run/evaluate_passage_reader.py
index 780b768d..4d5cf8b4 100644
--- a/pygaggle/run/evaluate_passage_reader.py
+++ b/pygaggle/run/evaluate_passage_reader.py
@@ -40,6 +40,7 @@ def construct_dpr(options: PassageReadingEvaluationOptions) -> Reader:
                                        options.max_answer_length,
                                        options.num_spans_per_passage)
 
+
 def display(ems):
     if len(ems) == 0:
         em = -1.
@@ -47,6 +48,7 @@ def display(ems):
         em = np.mean(np.array(ems)) * 100.
     logging.info(f'Exact Match Accuracy: {em}')
 
+
 def main():
     apb = ArgumentParserBuilder()
     apb.add_opts(
diff --git a/pygaggle/settings.py b/pygaggle/settings.py
index f9e7e3f6..d693e10f 100644
--- a/pygaggle/settings.py
+++ b/pygaggle/settings.py
@@ -14,9 +14,11 @@ class Settings(BaseSettings):
 class MsMarcoSettings(Settings):
     pass
 
+
 class TRECCovidSettings(Settings):
     pass
 
+
 class Cord19Settings(Settings):
     # T5 model settings
     t5_model_dir: str = 'gs://neuralresearcher_data/covid/data/model_exp304'
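
For reference, a minimal usage sketch of the reader-evaluation pieces renamed above (ground_truth_answers, best_answer); it is not part of either patch. The Query/Text import path and the ReaderEvaluator(reader) constructor are assumptions inferred from the hunks, and `reader` stands for any pygaggle Reader, for example the one returned by construct_dpr() in pygaggle/run/evaluate_passage_reader.py.

    from pygaggle.data.retrieval import RetrievalExample
    from pygaggle.model.evaluate import ReaderEvaluator
    from pygaggle.rerank.base import Query, Text  # assumed location of Query and Text

    def exact_match_accuracy(reader, items):
        # `items` follows the retrieval-file layout read in main() above: each item
        # carries "question", "contexts" (with '"title"\ntext' strings) and "answers".
        examples = [
            RetrievalExample(
                query=Query(text=item['question']),
                texts=[Text(text=ctx['text'].split('\n', 1)[1],
                            title=ctx['text'].split('\n', 1)[0][1:-1])
                       for ctx in item['contexts']],
                ground_truth_answers=item['answers'],  # renamed from groundTruthAnswers
            )
            for item in items
        ]
        dpr_predictions = []  # evaluate() appends {'question', 'prediction'} dicts here
        ems = ReaderEvaluator(reader).evaluate(examples, dpr_predictions)
        return 100. * sum(ems) / max(len(ems), 1), dpr_predictions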