Skip to content

Commit

Permalink
added replacement strings to token pattern
Browse files: browse the repository at this point in the history
  • Loading branch information
Alexandra DeLucia committed Jun 21, 2020
1 parent 3cfc636 commit 0440221
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion littlebird/tweet_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def __init__(
"|".join([self.HANDLE_RE, self.URL_RE, self.RT_RE, "(&amp)"])
)
self.WHITESPACE_RE = regex.compile(r"\s+")
self.TOKEN_RE = regex.compile(token_pattern)

self.remove_hashtags = remove_hashtags
self.lowercase = lowercase
self.expand_contractions = expand_contractions
Expand All @@ -209,8 +209,17 @@ def __init__(
else:
self.stopwords = None
self.include_retweet_and_quoted_content = include_retweet_and_quoted_content

self.handle_sub = replace_usernames_with
self.url_sub = replace_urls_with

# Add handle and URL replacement strings
# so they do not get parsed out
if self.handle_sub.strip() != "":
token_pattern += f"|{self.handle_sub}"
if self.url_sub.strip() != "":
token_pattern += f"|{self.url_sub}"
self.TOKEN_RE = regex.compile(token_pattern)

def tokenize(self, tweet: str) -> List[str]:
"""
Expand Down

0 comments on commit 0440221

Please sign in to comment.