From 2ed2acc62e445e3e887c6cf853ccc0b0b3b57534 Mon Sep 17 00:00:00 2001 From: qguo96 Date: Tue, 29 Sep 2020 17:30:36 -0400 Subject: [PATCH] Add more prebuilt indexes features (#235) + prebuild indexes for BERTserini --- README.md | 2 + pyserini/index/_base.py | 23 +++++++++++ pyserini/indexInfo.py | 74 ++++++++++++++++++++++++++++++++++++ pyserini/search/_searcher.py | 19 ++++++++- pyserini/util.py | 51 +++++++++++++------------ 5 files changed, 143 insertions(+), 26 deletions(-) create mode 100644 pyserini/indexInfo.py diff --git a/README.md b/README.md index 617db004f..bfc8cc35f 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,8 @@ It currently supports: + robust04 (TREC Disks 4 & 5) + ms-marco-passage (MS MARCO Passage) + ms-marco-doc (MS MARCO Doc) ++ enwiki-paragraphs (English Wikipedia) ++ zhwiki-paragraphs (Chinese Wikipedia) ## How Do I Fetch a Document? diff --git a/pyserini/index/_base.py b/pyserini/index/_base.py index 6e33b0066..83591c03b 100644 --- a/pyserini/index/_base.py +++ b/pyserini/index/_base.py @@ -27,6 +27,7 @@ from ..analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils from ..pyclass import autoclass, JString from ..search import Document +from pyserini.util import download_prebuilt_index, get_indexes_info logger = logging.getLogger(__name__) @@ -150,6 +151,28 @@ def __init__(self, index_dir): self.object = JIndexReader() self.reader = self.object.getReader(JString(index_dir)) + @classmethod + def from_prebuilt_index(cls, prebuilt_index_name: str): + """Build an index reader from the prebuilt index, download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + + Returns + ------- + IndexReader + Index reader built from the prebuilt index. + """ + index_dir = download_prebuilt_index(prebuilt_index_name) + return cls(index_dir) + + @staticmethod + def list_prebuilt_indexes(): + """Display available prebuilt indexes' information.""" + get_indexes_info() + def analyze(self, text: str, analyzer=None) -> List[str]: """Analyze a piece of text. Applies Anserini's default Lucene analyzer if analyzer not specified. diff --git a/pyserini/indexInfo.py b/pyserini/indexInfo.py new file mode 100644 index 000000000..8d45b3013 --- /dev/null +++ b/pyserini/indexInfo.py @@ -0,0 +1,74 @@ +INDEX_INFO = { + "robust04": { + "name": "robust04", + "description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)", + "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"}, + "md5": "15f3d001489c97849a010b0a4734d018", + "downloaded": False, + "size compressed": "1821814915 bytes", + "size uncompressed": "2172142080 bytes", + "total_terms": 174540872, + "documents": 528030, + "non_empty_documents": 528030, + "unique_terms": 923436}, + "trec45": { + "name": "trec45", + "description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)", + "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"}, + "md5": "15f3d001489c97849a010b0a4734d018", + "downloaded": False, + "size compressed": "1821814915 bytes", + "size uncompressed": "2172142080 bytes", + "total_terms": 174540872, + "documents": 528030, + "non_empty_documents": 528030, + "unique_terms": 923436}, + "ms-marco-passage": { + "name": "ms-marco-passage", + "description": "MS MARCO Passage Dataset", + "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz"}, + "md5": "3c2ef64ee6d0ee8e317adcb341b92e28", + "downloaded": False, + "size compressed": "2153209812 bytes", + "size uncompressed": "2675783168 bytes", + "total_terms": 352316036, + "documents": 8841823, + "non_empty_documents": 8841823, + "unique_terms": -1}, + "ms-marco-doc": { + "name": "ms-marco-doc", + "description": "MS MARCO Doc Dataset", + "url": {"dropbox": "https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1"}, + "md5": "72b1a0f9a9094a86d15c6f4babf8967a", + "downloaded": False, + "size compressed": "13661943256 bytes", + "size uncompressed": "16769683456 bytes", + "total_terms": 2748636047, + "documents": 3213835, + "non_empty_documents": 3213835, + "unique_terms": -1}, + "enwiki-paragraphs": { + "name": "lucene-index.enwiki-20180701-paragraphs", + "description": "English Wikipedia", + "url": {"dropbox": "https://www.dropbox.com/s/b7qqaos9ot3atlp/lucene-index.enwiki-20180701-paragraphs.tar.gz?dl=1"}, + "md5": "77d1cd530579905dad2ee3c2bda1b73d", + "downloaded": False, + "size compressed": "17725958785 bytes", + "size uncompressed": "21854924288 bytes", + "total_terms": 1498980668, + "documents": 39880064, + "non_empty_documents": 39879903, + "unique_terms": -1}, + "zhwiki-paragraphs": { + "name": "lucene-index.zhwiki-20181201-paragraphs", + "description": "Chinese Wikipedia", + "url": {"dropbox": "https://www.dropbox.com/s/6zn16mombt0wirs/lucene-index.zhwiki-20181201-paragraphs.tar.gz?dl=1"}, + "md5": "c005af4036296972831288c894918a92", + "downloaded": False, + "size compressed": "3284531213 bytes", + "size uncompressed": "3893332992 bytes", + "total_terms": 320776789, + "documents": 4170312, + "non_empty_documents": 4170301, + "unique_terms": -1} +} diff --git a/pyserini/search/_searcher.py b/pyserini/search/_searcher.py index 6d6b6db7d..68286eed2 100644 --- a/pyserini/search/_searcher.py +++ b/pyserini/search/_searcher.py @@ -26,7 +26,7 @@ from pyserini.pyclass import autoclass, JString, JArrayList from pyserini.trectools import TrecRun from pyserini.fusion import FusionMethod, reciprocal_rank_fusion -from pyserini.util import download_prebuilt_index +from pyserini.util import download_prebuilt_index, get_indexes_info logger = logging.getLogger(__name__) @@ -51,9 +51,26 @@ def __init__(self, index_dir: str): @classmethod def from_prebuilt_index(cls, prebuilt_index_name: str): + """Build a searcher from the prebuilt index, download the index if necessary. + + Parameters + ---------- + prebuilt_index_name : str + Prebuilt index name. + + Returns + ------- + SimpleSearcher + Searcher built from the prebuilt index. + """ index_dir = download_prebuilt_index(prebuilt_index_name) return cls(index_dir) + @staticmethod + def list_prebuilt_indexes(): + """Display available prebuilt indexes' information.""" + get_indexes_info() + def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGenerator = None, strip_segment_id=False, remove_dups=False) -> List[JSimpleSearcherResult]: """Search the collection. diff --git a/pyserini/util.py b/pyserini/util.py index 2dae1cde1..c738eb424 100644 --- a/pyserini/util.py +++ b/pyserini/util.py @@ -21,25 +21,8 @@ import tarfile from tqdm import tqdm from urllib.request import urlretrieve - -INDEX_INFO = { - 'index-marco-passage': { - 'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz'}, - 'md5': '3c2ef64ee6d0ee8e317adcb341b92e28'}, - 'index-marco-doc': { - 'urls': {'dropbox': 'https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1'}, - 'md5': '72b1a0f9a9094a86d15c6f4babf8967a'}, - 'index-robust04': { - 'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz'}, - 'md5': '15f3d001489c97849a010b0a4734d018'} -} - -INDEX_MAPPING = { - 'ms-marco-passage': INDEX_INFO['index-marco-passage'], - 'ms-marco-doc': INDEX_INFO['index-marco-doc'], - 'trec45': INDEX_INFO['index-robust04'], - 'robust04': INDEX_INFO['index-robust04'] -} +import pandas as pd +from pyserini.indexInfo import INDEX_INFO # https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5 @@ -99,7 +82,7 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo if prebuilt: index_directory = os.path.join(get_cache_home(), 'indexes') - index_path = os.path.join(index_directory, f'{index_name}{md5}') + index_path = os.path.join(index_directory, f'{index_name}.{md5}') local_tarball = os.path.join(index_directory, f'{index_name}.tar.gz') if not os.path.exists(index_directory): os.makedirs(index_directory) @@ -139,15 +122,33 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo os.rename(os.path.join(index_directory, f'{index_name}'), index_path) return index_path +def check_downloaded(index_name): + mirror = next(iter(INDEX_INFO[index_name]["url"])) + index_url = INDEX_INFO[index_name]["url"][mirror] + index_md5 = INDEX_INFO[index_name]["md5"] + index_name = index_url.split('/')[-1] + index_name = re.sub('''.tar.gz.*$''', '', index_name) + index_directory = os.path.join(get_cache_home(), 'indexes') + index_path = os.path.join(index_directory, f'{index_name}.{index_md5}') + return os.path.exists(index_path) + +def get_indexes_info(): + indexDf = pd.DataFrame.from_dict(INDEX_INFO) + for index in indexDf.keys(): + indexDf[index]['downloaded'] = check_downloaded(index) + with pd.option_context('display.max_rows', None, 'display.max_columns', \ + None, 'display.max_colwidth', -1, 'display.colheader_justify', 'left'): + print(indexDf) def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None): - if index_name in INDEX_MAPPING: + if index_name in INDEX_INFO: if not mirror: - mirror = next(iter(INDEX_MAPPING[index_name]["urls"])) - elif mirror not in INDEX_MAPPING[index_name]["urls"]: + mirror = next(iter(INDEX_INFO[index_name]["url"])) + elif mirror not in INDEX_INFO[index_name]["url"]: raise ValueError("unrecognized mirror name {}".format(mirror)) - index_url = INDEX_MAPPING[index_name]["urls"][mirror] - index_md5 = INDEX_MAPPING[index_name]["md5"] + index_url = INDEX_INFO[index_name]["url"][mirror] + index_md5 = INDEX_INFO[index_name]["md5"] return download_and_unpack_index(index_url, prebuilt=True, md5=index_md5) else: raise ValueError("unrecognized index name {}".format(index_name)) +