updates
Alexandra DeLucia committed Jul 1, 2020
1 parent 0440221 commit ce5e018
Showing 2 changed files with 20 additions and 12 deletions.
27 changes: 15 additions & 12 deletions littlebird/tweet_tokenizer.py
@@ -189,14 +189,11 @@ def __init__(
         self.language = language
 
         # Handle pattern from NLTK
-        self.HANDLE_RE = r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
-        self.URL_RE = r"http(s)?:\/\/[\w\.\/\?\=]+"
-        self.RT_RE = r"\bRT\b"
+        self.HANDLE_RE = regex.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")
+        self.URL_RE = regex.compile(r"http(s)?:\/\/[\w\.\/\?\=]+")
+        self.RT_RE = regex.compile(r"\bRT\b")
         self.HASHTAG_RE = regex.compile(r"#[\p{L}\p{N}_]+")
         self.LONE_DIGIT_RE = regex.compile(r"\b\d+\b")
-        self.REMOVAL_RE = regex.compile(
-            "|".join([self.HANDLE_RE, self.URL_RE, self.RT_RE, "(&amp)"])
-        )
         self.WHITESPACE_RE = regex.compile(r"\s+")
 
         self.remove_hashtags = remove_hashtags
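
Note: the patterns rely on the third-party `regex` package (not the standard-library `re`), which supports Unicode property classes such as `\p{L}`. A minimal sketch of the compiled patterns in isolation, with a made-up sample tweet:

    import regex  # third-party: pip install regex

    HASHTAG_RE = regex.compile(r"#[\p{L}\p{N}_]+")
    URL_RE = regex.compile(r"http(s)?:\/\/[\w\.\/\?\=]+")

    # Hypothetical sample tweet for illustration
    tweet = "RT @user: see https://example.com/page #NLProc #日本語"

    print(HASHTAG_RE.findall(tweet))  # ['#NLProc', '#日本語']
    print(URL_RE.sub(" ", tweet))     # URL replaced with a space

Precompiling each pattern once in `__init__` also avoids recompiling on every `tokenize` call.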
@@ -238,6 +235,9 @@ def tokenize(self, tweet: str) -> List[str]:
         # Remove "RT"
         tweet = self.RT_RE.sub(" ", tweet)
 
+        # Remove pesky ampersand
+        tweet = regex.sub("(&amp)", " ", tweet)
+
         # Lowercase
         if self.lowercase:
             tweet = tweet.lower()
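
As a hedged illustration (assuming the surrounding `tokenize` body matches the context lines above), the cleanup steps run in sequence over the raw text. Note the pattern matches `&amp` without the trailing semicolon, so a `;` can survive:

    import regex

    RT_RE = regex.compile(r"\bRT\b")

    def clean(tweet: str) -> str:
        """Sketch of the removal steps shown in the hunk above."""
        tweet = RT_RE.sub(" ", tweet)            # drop the retweet marker
        tweet = regex.sub("(&amp)", " ", tweet)  # drop HTML-escaped ampersands
        return tweet.lower()                     # lowercase (when enabled)

    print(clean("RT This &amp; that").split())  # ['this', ';', 'that']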
@@ -332,22 +332,25 @@ def get_hashtags(self, tweet: Dict[str, Any]) -> List[str]:
"""
# Check if tweet is truncated
if tweet.get("truncated", False):
hashtags = tweet["extended_tweet"]["entities"]["hashtags"]["text"]
hashtags = tweet["extended_tweet"]["entities"]["hashtags"]
else:
text = tweet["entities"]["hashtags"]["text"]
hashtags = tweet["entities"]["hashtags"]

# Include retweeted/quoted content
if self.include_retweet_and_quoted_content:
if "quoted_status" in tweet:
if tweet["quoted_status"].get("extended_tweet", False):
hashtags.extend(tweet["quoted_status"]["extended_tweet"]["entities"]["hashtags"]["text"])
hashtags.extend(tweet["quoted_status"]["extended_tweet"]["entities"]["hashtags"])
else:
hashtags.extend(tweet["quoted_status"]["entities"]["hashtags"]["text"])
hashtags.extend(tweet["quoted_status"]["entities"]["hashtags"])
if "retweeted_status" in tweet:
if tweet["retweeted_status"].get("extended_tweet", False):
hashtags.extend(tweet["retweeted_status"]["extended_tweet"]["entities"]["hashtags"]["text"])
hashtags.extend(tweet["retweeted_status"]["extended_tweet"]["entities"]["hashtags"])
else:
hashtags.extend(tweet["retweeted_status"]["entities"]["hashtags"]["text"])
hashtags.extend(tweet["retweeted_status"]["entities"]["hashtags"])

# Only return the text
hashtags = [h.get("text") for h in hashtags]
return hashtags
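
The motivation for this hunk: in the Twitter v1.1 payload, `entities["hashtags"]` is a list of objects like `{"indices": [...], "text": "..."}`, so the old code indexed a list with the string `"text"` and raised a `TypeError`. The fix collects the entity dicts and extracts `"text"` once at the end. A minimal sketch against a hand-built tweet dict (field values are illustrative):

    # Hypothetical minimal tweet payload for illustration
    tweet = {
        "truncated": False,
        "entities": {
            "hashtags": [
                {"indices": [0, 7], "text": "NLProc"},
                {"indices": [8, 15], "text": "Python"},
            ]
        },
    }

    hashtags = tweet["entities"]["hashtags"]   # a list of dicts
    # tweet["entities"]["hashtags"]["text"]    # old code: TypeError
    print([h.get("text") for h in hashtags])   # ['NLProc', 'Python']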


5 changes: 5 additions & 0 deletions tests/test_tweet_tokenizer.py
@@ -111,11 +111,16 @@ def test_get_tweet_text(self):
         all_text = tokenizer.get_tweet_text(tweet)
         self.assertEqual(all_text, right_answer)
 
+    def test_get_tokenized_tweet_text(self):
+        return
+
     def test_tokenize_file(self):
         tokenizer = TweetTokenizer()
         text = tokenizer.tokenize_tweet_file("demo_tweets.json")
 
+    def test_get_hashtags(self):
+        return
 
 
 if __name__ == "__main__":
     unittest.main()
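
The two new tests are stubs that `return` immediately. A hedged sketch of what `test_get_hashtags` might eventually assert, reusing the illustrative payload above (the class and method names come from the diff; the sample dict is hypothetical):

    def test_get_hashtags(self):
        tokenizer = TweetTokenizer()
        tweet = {
            "truncated": False,
            "entities": {"hashtags": [{"indices": [0, 7], "text": "NLProc"}]},
        }
        self.assertEqual(tokenizer.get_hashtags(tweet), ["NLProc"])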
