Skip to content

Commit

Permalink
Do not remove newline in Wayback download script
Browse files (browse the repository at this point in the history)
  • Loading branch information
tuzhucheng committed Jan 4, 2021
1 parent dc9cf6b commit 0c5e226
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion collection/download_wayback_passages.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def extract_text(html_text: str) -> str:
output = ''
for t in text:
if t.parent.name not in blacklist:
- output += f'{t.strip()} '
+ output += f'{t} '

return output

Expand Down
2 changes: 1 addition & 1 deletion collection/paragraph_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


def chunk_doc(content: str) -> List[str]:
- """Given a document, return a list of passages of no more than MIN_PASSAGE_TOKENS tokens."""
+ """Given a document, return a list of passages of no fewer than MIN_PASSAGE_TOKENS tokens / passage until EOF."""
passages = []
passage_tokens = []
lines = content.split('\n')
Expand Down

0 comments on commit 0c5e226

Please sign in to comment.