Commit 7248efe (1 parent: 7a38d3a)

Add initial version of collection building script

Showing 6 changed files with 698 additions and 0 deletions.
@@ -0,0 +1,59 @@
Acknowledgements

Portions of ml-qrecc may utilize the following copyrighted
material, the use of which is hereby acknowledged.

_____________________

Leonard Richardson and contributors (Beautiful Soup)
The MIT License

Copyright (c) 2004-2020 Leonard Richardson and contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

Pyserini Authors (Pyserini)
Copyright 2019-2020, The Pyserini Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

warcio Authors (warcio)
Copyright 2017-2020, The warcio Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@@ -0,0 +1,40 @@
# Building the Collection

This directory contains the scripts and instructions for downloading, processing, and building the collection for our baseline.

The collection consists of webpages from the Common Crawl and the Wayback Machine.
**Please note that the Common Crawl collection is [quite large](https://commoncrawl.org/2019/11/november-2019-crawl-archive-now-available/) (on the order of tens of TBs), so please check the data cap of your internet plan to make sure you stay within the limit.**

To download pages from the Common Crawl, run the following command.
This saves webpage documents in [JSON lines](https://jsonlines.org) (.jsonl) format into the `collection/commoncrawl` subdirectory.
For us this took slightly over a day to run.

```bash
time python download_commoncrawl_passages.py --output-directory collection/commoncrawl --workers 8
```
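
Each line of these .jsonl files is a single JSON object with an `id` field (the source URL) and a `contents` field (the extracted page text), the document shape that Pyserini's `JsonCollection` expects. As a quick sanity check you can peek at one downloaded shard; the file name below is only a placeholder:

```python
import json

# Hypothetical shard name; real files are named after the source WET archive,
# e.g. <something>.warc.wet.gz.jsonl.
with open('collection/commoncrawl/CC-MAIN-example.warc.wet.gz.jsonl') as f:
    for line in f:
        doc = json.loads(line)
        print(doc['id'], len(doc['contents']))
        break
```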

To download pages from the Wayback Machine, run the following command after you've extracted the dataset.
This saves webpage documents in .jsonl format into the `collection/wayback` subdirectory.
For us this took 9 hours to run.

```bash
time python download_wayback_passages.py --inputs '../dataset/*.json' --output-directory collection/wayback --workers 4
```

Next, segment the webpage documents into smaller passages with the following command.
This step is quick and took us several minutes.

```bash
time python paragraph_chunker.py --input-directory collection --output-directory collection-paragraph --workers 8
```

Finally, index the passages using [Pyserini](https://github.com/castorini/pyserini/), a Python wrapper around [Anserini](http://anserini.io/), an information retrieval toolkit built on Lucene.
A Java JDK is required as a prerequisite.
After installing Pyserini, run the following command to build the index.
For us this took less than 2 hours.

```bash
time python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
  -threads 76 -input collection-paragraph \
  -index index-paragraph -storePositions -storeDocvectors -storeRaw
```
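
Once the index is built, you can sanity-check it with a quick interactive search. This is only a sketch; the exact class name depends on your Pyserini version (older releases expose `SimpleSearcher`, newer ones `LuceneSearcher`):

```python
from pyserini.search import SimpleSearcher  # LuceneSearcher in newer Pyserini releases

# Open the index built above and run a test query against the passage index.
searcher = SimpleSearcher('index-paragraph')
hits = searcher.search('when was the common crawl foundation started', k=5)
for hit in hits:
    print(f'{hit.docid:60} {hit.score:.4f}')
```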

@@ -0,0 +1,254 @@
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2020 Apple Inc. All Rights Reserved.
#

"""
This script creates a corpus of documents from the November 2019 Common Crawl archive.
"""

from argparse import ArgumentParser
from collections import defaultdict
import gzip
import json
import logging
from multiprocessing import Pool
import os
from pathlib import Path
import re
import shutil
import tempfile
from typing import Dict, List, Tuple
import urllib.request

from warcio.archiveiterator import ArchiveIterator

index_files_root = Path('index-files')
filter_lists_root = Path('filter-lists')
sampled_filter_lists_root = Path('filter-lists-sampled')
wet_files_cache = Path('wet-files')


def get_cc_index_paths() -> List[str]:
    """Get a list of paths for Common Crawl URL index files."""
    index_paths = []
    with tempfile.NamedTemporaryFile() as temp_f:
        urllib.request.urlretrieve(
            'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-47/cc-index.paths.gz',
            temp_f.name,
        )
        with gzip.open(temp_f.name, 'rb') as f:
            for line in f:
                line = line.decode('utf-8').rstrip()
                if line.endswith('.gz'):
                    index_paths.append(f'https://commoncrawl.s3.amazonaws.com/{line}')

    return index_paths


def get_cc_wet_paths() -> Dict[str, str]:
    """Get a dict of WET file name to WET URL."""
    wet_urls = {}
    with tempfile.NamedTemporaryFile() as temp_f:
        urllib.request.urlretrieve(
            'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-47/wet.paths.gz',
            temp_f.name,
        )
        with gzip.open(temp_f.name, 'rb') as f:
            for line in f:
                line = line.decode('utf-8').rstrip()
                filename = line.split('/')[-1]
                wet_urls[filename] = f'https://commoncrawl.s3.amazonaws.com/{line}'

    return wet_urls


def process_cc_index(index_url: str) -> Dict[str, List[str]]:
    """Return a map of WET file to list of URLs it contains."""
    # Download index file
    filename = index_url.split('/')[-1]
    index_files_root.mkdir(exist_ok=True)
    if not (index_files_root / filename).exists():
        urllib.request.urlretrieve(index_url, index_files_root / filename)

    # Parse index file
    wet_to_urls = defaultdict(list)
    cc_index_line_pattern = re.compile(r'^[\S]+ \d+ (.*)$')
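    # Illustrative shape of a cc-index line (not taken from the real index):
    # a SURT-style URL key, a timestamp, then a JSON payload, roughly
    #   com,example)/page 20191118073101 {"url": "http://example.com/page",
    #       "mime": "text/html", "status": "200", "languages": "eng",
    #       "filename": "crawl-data/CC-MAIN-2019-47/.../CC-MAIN-...warc.gz", ...}
    # The regex above splits off the JSON payload, which is filtered below.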
    with gzip.open(index_files_root / filename, 'rb') as f:
        for line in f:
            line = line.decode('utf-8').rstrip()
            match = cc_index_line_pattern.match(line)
            if match:
                url_metadata = json.loads(match.group(1))
                if (
                    url_metadata['status'] == '200'
                    and url_metadata.get('languages') == 'eng'
                    and url_metadata['mime'] == 'text/html'
                ):
                    wet_filename = url_metadata['filename'].split('/')[-1]
                    wet_to_urls[wet_filename].append(url_metadata['url'])
            else:
                logging.error(f'Line in index file cannot be matched by regex: {line}')

    return wet_to_urls


def sort_and_sample_filter_list(filter_list_path: Path) -> None:
    """Sort and sample URLs in a filter list."""
    urls = []
    with open(filter_list_path) as f:
        for line in f:
            urls.append(line.rstrip())

    urls.sort()

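    # Keep every 100th URL from the sorted list, i.e. roughly a 1% deterministic
    # sample of the pages referenced by this WET file.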
    with open(sampled_filter_lists_root / filter_list_path.name, 'w') as f:
        for i, url in enumerate(urls):
            if i % 100 == 0:
                f.write(url + '\n')


def sample_filter_lists() -> None:
    """Sample filter lists."""
    filter_lists = list(filter_lists_root.iterdir())
    sampled_filter_lists_root.mkdir(exist_ok=True)

    with Pool() as p:
        p.map(sort_and_sample_filter_list, filter_lists)


def process_wet_file(tup: Tuple[Path, str, str, Path]) -> None:
    """Download WET file and extract webpages from WARC whose URL is in the filter list."""
    filter_list, wet_name, wet_url, commoncrawl_docs_root = tup
    accepted_urls = set()
    with open(filter_list) as f:
        for line in f:
            accepted_urls.add(line.rstrip())

    attempt = 0
    while attempt < 3:
        try:
            urllib.request.urlretrieve(wet_url, wet_files_cache / wet_name)
            break
        except Exception:
            logging.exception(f'Error while downloading {wet_url}')
            attempt += 1

    if not (wet_files_cache / wet_name).exists():
        logging.error(
            f'Failed to download {wet_url} after 3 attempts. Ignoring file...'
        )
        return

    with gzip.open(wet_files_cache / wet_name, 'rb') as stream, open(
        commoncrawl_docs_root / f'{wet_name}.jsonl', 'w'
    ) as f:
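        # WET archives store the plain-text extraction of each crawled page as
        # 'conversion' records; the WARC-Target-URI header carries the original
        # page URL, which is matched against the sampled filter list below.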
        for record in ArchiveIterator(stream):
            if record.rec_type == 'conversion':
                url = record.rec_headers.get_header('WARC-Target-URI')
                if url not in accepted_urls:
                    continue

                contents = record.content_stream().read().decode('utf-8')
                if contents.startswith('404 Not Found'):
                    continue

                output_dict = {'id': url, 'contents': contents}

                f.write(json.dumps(output_dict) + '\n')

    os.remove(wet_files_cache / wet_name)


def get_docs_from_wet_files(parallelism, commoncrawl_docs_root: Path) -> None:
    """Download WET files and extract webpages whose URLs are in the filter list."""
    wet_files_cache.mkdir(exist_ok=True)
    commoncrawl_docs_root.mkdir(exist_ok=True, parents=True)

    filter_lists = list(sampled_filter_lists_root.iterdir())

    # Download WET file paths
    wet_paths = get_cc_wet_paths()
    wet_names = []
    resolved_wet_paths = []
    for filter_list in filter_lists:
        wet_filename = str(filter_list.name).replace('.warc.gz.txt', '.warc.wet.gz')
        wet_names.append(wet_filename)
        resolved_wet_paths.append(wet_paths[wet_filename])

    with Pool(parallelism) as p:
        for i, _ in enumerate(
            p.imap_unordered(
                process_wet_file,
                zip(
                    filter_lists,
                    wet_names,
                    resolved_wet_paths,
                    [commoncrawl_docs_root for _ in range(len(filter_lists))],
                ),
            )
        ):
            if (i + 1) % 50 == 0:
                logging.info(f'Processed {i + 1} / {len(filter_lists)} WET files...')


def main(parallelism: int, commoncrawl_docs_root: Path):
    cc_index_paths = get_cc_index_paths()

    # Construct filter lists
    if filter_lists_root.exists():
        shutil.rmtree(filter_lists_root)
    filter_lists_root.mkdir(exist_ok=True)

    for i in range(0, len(cc_index_paths), parallelism):
        with Pool(parallelism) as p:
            logging.info(
                f'Processing Common Crawl index {i+1}-{min(i + parallelism, len(cc_index_paths))} / {len(cc_index_paths)}...'
            )
            partial_filter_lists = p.map(
                process_cc_index, cc_index_paths[i : i + parallelism]
            )
            for partial_filter_list in partial_filter_lists:
                for wet_filename, urls in partial_filter_list.items():
                    with open(filter_lists_root / f'{wet_filename}.txt', 'a') as f:
                        for url in urls:
                            f.write(url + '\n')

    # Create sampled filter lists
    logging.info('Sorting and sampling filter lists...')
    sample_filter_lists()

    # Download WET files and filter records
    logging.info('Processing WET files...')
    get_docs_from_wet_files(parallelism, commoncrawl_docs_root)

    # Remove temporary files
    logging.info('Done processing WET files, removing temporary directories...')
    shutil.rmtree(index_files_root)
    shutil.rmtree(filter_lists_root)
    shutil.rmtree(sampled_filter_lists_root)
    shutil.rmtree(wet_files_cache)


if __name__ == '__main__':
    parser = ArgumentParser(
        description='Creates a corpus of documents from the November 2019 Common Crawl archive'
    )
    parser.add_argument(
        '--output-directory',
        default='docs/common-crawl',
        help='Path to directory containing document output, defaults to docs/common-crawl',
    )
    parser.add_argument(
        '--workers',
        default=8,
        type=int,
        help='Number of workers for downloading in parallel',
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    commoncrawl_docs_root = Path(args.output_directory)

    main(args.workers, commoncrawl_docs_root)