Skip to content

Commit

Permalink
get_tokenized_tweet_text
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandra DeLucia committed Jun 21, 2020
1 parent 86a422c commit fa28eed
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions littlebird/tweet_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ def get_tweet_text(self, tweet: Dict[str, Any]) -> List[str]:
text += f" {tweet['retweeted_status']['text']}"
return text

def get_tokenized_tweet_text(self, tweet: Dict[str, Any]) -> str:
    """
    Extract a Tweet's text and return it tokenized as a single string.

    Shortcut for running "get_tweet_text" on the Tweet, feeding the
    result to "tokenize", and gluing the resulting tokens back together
    with single spaces.
    """
    return " ".join(self.tokenize(self.get_tweet_text(tweet)))

def tokenize_tweet_file(
self, input_file: str, sample_size: int = -1, return_tokens: bool = False
) -> Union[List[str], List[List[str]]]:
Expand Down

0 comments on commit fa28eed

Please sign in to comment.