Skip to content

Commit

Permalink
added BERTweet tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandra DeLucia committed Jul 28, 2020
1 parent 089d152 commit 20579cd
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 1 deletion.
44 changes: 44 additions & 0 deletions littlebird/tweet_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import urllib.request
import urllib.error
from lxml.html import parse
from nltk.tokenize import TweetTokenizer
from emoji import demojize

# Local modules
from littlebird import TweetReader
Expand Down Expand Up @@ -381,3 +383,45 @@ def _replace_uppercase_tokens(self, match) -> str:
return token.lower() + " <allcaps> "


class BERTweetTokenizer(BaseTweetTokenizer):
    """
    Tokenizer to pre-process Tweets before getting BERTweet embeddings.

    Adapted from the official BERTweet normalizer:
    https://github.com/VinAIResearch/BERTweet/blob/master/TweetNormalizer.py
    """
    def __init__(self, include_retweeted_and_quoted_content: bool = True):
        """
        :param include_retweeted_and_quoted_content: passed through to the
            base class; controls whether RT/quoted text is kept.
        """
        # Initialize base class
        super().__init__(include_retweeted_and_quoted_content)
        # NLTK's TweetTokenizer handles mentions, hashtags, emoticons, etc.
        self.nltk_tokenizer = TweetTokenizer()

    def tokenize(self, tweet: str) -> List[str]:
        """
        Normalize and tokenize a single Tweet for BERTweet.

        :param tweet: raw Tweet text
        :return: list of normalized tokens
        """
        # Normalize typographic apostrophes/ellipses before tokenizing.
        # FIX: original referenced undefined module-level `tokenizer`;
        # the tokenizer is stored on the instance by __init__.
        tokens = self.nltk_tokenizer.tokenize(
            tweet.replace("’", "'").replace("…", "...")
        )
        # FIX: original referenced undefined `normalizeToken`; the
        # normalizer is the `normalize_token` method defined below.
        normTweet = " ".join(self.normalize_token(token) for token in tokens)

        # Re-attach English contractions that the tokenizer split apart.
        normTweet = normTweet.replace("cannot ", "can not ").replace("n't ", " n't ").replace("n 't ", " n't ").replace("ca n't", "can't").replace("ai n't", "ain't")
        normTweet = normTweet.replace("'m ", " 'm ").replace("'re ", " 're ").replace("'s ", " 's ").replace("'ll ", " 'll ").replace("'d ", " 'd ").replace("'ve ", " 've ")
        # Collapse spaced-out "a.m."/"p.m." abbreviations back together.
        normTweet = normTweet.replace(" p . m .", " p.m.") .replace(" p . m ", " p.m ").replace(" a . m .", " a.m.").replace(" a . m ", " a.m ")

        # Re-join numeric sequences split around punctuation
        # (e.g. "1 , 000" -> "1,000", dates "12 / 2020" -> "12/2020").
        normTweet = regex.sub(r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2", normTweet)
        normTweet = regex.sub(r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2", normTweet)
        normTweet = regex.sub(r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2", normTweet)

        return normTweet.split()

    def normalize_token(self, token: str) -> str:
        """
        Normalize a single token following the BERTweet convention.

        :param token: a single token produced by the tokenizer
        :return: the normalized token ("@USER" for mentions, "HTTPURL"
            for links, emoji replaced by their text aliases, otherwise
            the token unchanged)
        """
        lowercased_token = token.lower()
        if token.startswith("@"):
            # Anonymize user mentions.
            return "@USER"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        elif len(token) == 1:
            # Single characters may be emoji; demojize maps them to
            # ":alias:" text and leaves non-emoji characters unchanged.
            return demojize(token)
        else:
            # NOTE(review): these two branches are unreachable in practice
            # because tokenize() replaces "’" and "…" before tokenizing;
            # kept to match the upstream BERTweet normalizer exactly.
            if token == "’":
                return "'"
            elif token == "…":
                return "..."
            else:
                return token


2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(),
install_requires=["jsonlines", "regex", "filetype", "lxml"],
install_requires=["jsonlines", "regex", "filetype", "lxml", "emoji", "nltk"],
license="MIT",
classifiers=[
"Programming Language :: Python :: 3",
Expand Down

0 comments on commit 20579cd

Please sign in to comment.