updated README with examples
Alexandra DeLucia committed Jun 7, 2020
1 parent 1fde4af commit b152396
Showing 3 changed files with 99 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
# Misc cache
*.swp
*.nfs*

# Byte-compiled / optimized / DLL files
__pycache__/
86 changes: 77 additions & 9 deletions README.md
@@ -1,40 +1,108 @@
# Little Bird

Basic utilities for processing Tweets. Includes:
* `TweetTokenizer` for tokenizing the Tweet content
* `TweetReader` for easily iterating over Tweets
* `TweetWriter` for conveniently writing one or more Tweets to a file in JSONlines format

## Installation
There are two options for installing `littlebird`.

### 1. Install from source
```bash
git clone https://github.com/AADeLucia/littlebird.git
cd littlebird
python setup.py develop
```

### 2. Install with `pip`
```bash
pip install git+git://github.com/AADeLucia/littlebird.git#egg=littlebird
```


## Read and Write a Tweet JSONlines file
The example below reads in a Tweet file, keeps only the Tweets that contain a hashtag, and writes them to a new file.

`TweetWriter` can write a single Tweet or a list of Tweets to a file in JSONlines format. It will also automatically open a GZIP file if the provided filename has a `.gz` extension. If you are writing to a GZIP file, it is recommended to write all Tweets at once instead of writing incrementally; this gives better file compression. If you do need to write incrementally, I recommend writing to a normal file and GZIPping it afterwards. A short GZIP sketch follows the example below.

```python
from littlebird import TweetReader, TweetWriter

# File in JSONlines form. Automatically handles GZIP files.
tweet_file = "2014_01_02.json.gz"
reader = TweetReader(tweet_file)

# Iterate over Tweets and save those that contain hashtags
filtered_tweets = []
for tweet in reader.read_tweets():
    if tweet.get("truncated", False):
        num_hashtags = len(tweet["extended_tweet"]["entities"]["hashtags"])
    else:
        num_hashtags = len(tweet["entities"]["hashtags"])

    if num_hashtags > 0:
        filtered_tweets.append(tweet)

# Write out the filtered Tweets
writer = TweetWriter("filtered.json")
writer.write(filtered_tweets)
```
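
As noted above, `TweetWriter` picks GZIP output based on the file extension. A minimal sketch, continuing from the example above and assuming only the `.gz` behavior described in this README:

```python
# Writing all filtered Tweets in one call gives better GZIP compression
gzip_writer = TweetWriter("filtered.json.gz")
gzip_writer.write(filtered_tweets)
```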


## Tokenize Tweets

### Example usage
A basic example using the default tokenizer settings is shown below.

```python
from littlebird import TweetReader, TweetTokenizer

# File in JSONlines form. Automatically handles GZIP files.
tweet_file = "2014_01_02.json.gz"
reader = TweetReader(tweet_file)

# Initialize the tokenizer with default settings
tokenizer = TweetTokenizer()

# Iterate over Tweets
# Make sure to check the "truncated" field, otherwise you will only access the
# 140-character Tweet, not the full 280-character message
for tweet in reader.read_tweets():
    if tweet.get("truncated", False):
        text = tweet["extended_tweet"]["full_text"]
    else:
        text = tweet["text"]

    # Tokenize the Tweet's text
    tokens = tokenizer.tokenize(text)
```

### Tokenizer settings
Available `TweetTokenizer` settings are listed below; a short configuration sketch follows the list.

* `language`: currently only English is supported, but other languages should work as long as you change the `token_pattern` accordingly. A future integration may use `Moses` for Arabic tokenization.
* `token_pattern`: Pattern to match for acceptable tokens. Default is `r"\b\w+\b"`
* `stopwords`: provide a list of stopwords to remove from the text. Default is `None` for no stopword removal.
* `remove_hashtags`: Default is `False`, which keeps the hashtag text (only the "#" symbol is stripped); set to `True` to remove hashtags entirely
* `lowercase`: Default is `True` to lowercase all of the text. Change this to `False` if you are doing case-sensitive tasks like Named Entity Recognition (NER)
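
For example, a sketch of a customized tokenizer, assuming the keyword arguments match the setting names listed above (the stopword list is invented for illustration):

```python
from littlebird import TweetTokenizer

tokenizer = TweetTokenizer(
    token_pattern=r"\b\p{L}+\b",   # letters only, across all alphabets
    stopwords=["the", "a", "an"],  # illustrative stopword list
    remove_hashtags=True,          # drop hashtags entirely
    lowercase=True,
)
# "The quick #brown fox" should tokenize to roughly ["quick", "fox"]
```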

The tokenizer works in the following steps (a worked example follows the list):

1. Remove hashtags (optional)
2. Remove URLs, handles, and "RT"
3. Lowercase the text (optional)
4. Find all tokens that match the `token_pattern` with `regex.findall(token_pattern, text)`
5. Remove stopwords (optional)
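
A rough illustration of these steps with the default settings (the Tweet text is invented, and the exact output depends on the patterns described below):

```python
from littlebird import TweetTokenizer

tokenizer = TweetTokenizer()
tokens = tokenizer.tokenize("RT @user: Check out https://example.com #NLP rocks!")
# The handle, URL, and "RT" are stripped and the text is lowercased,
# so this should yield something like: ["check", "out", "nlp", "rocks"]
```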


#### Token patterns
The token pattern is important to set correctly for your use case. Below are some sample token patterns (with a short comparison snippet after the list), but I highly recommend [brushing up on your regular expressions](http://www.regular-expressions.info/tutorial.html) if you need something more advanced.

**Note:** the `regex` package is used (instead of the built-in `re`) so that Unicode character classes like `\p{L}` are available, much like [Java regex patterns](http://www.regular-expressions.info/tutorial.html).

* `r"\b\w+\b"` matches any token with one or more letters, numbers, and underscores. This is equivalent to `"[\p{L}\_\p{N}]+"`
* `r"\b\p{L}+\b"` matches any token with one or more "letters" (across all alphabets).

This package is a work in progress. Feel free to open an issue for any problems you run into or any feature requests. I started this package as an in-between: something lighter than Twokenizer but more customizable than NLTK.

Copyright (c) 2020 Alexandra DeLucia

25 changes: 21 additions & 4 deletions littlebird/tweet_tokenizer.py
@@ -1,5 +1,7 @@
"""
General utilities to open a Twitter file.
Author: Alexandra DeLucia
"""
# Standard imports
import os
@@ -49,7 +51,7 @@ def __init__(self,
Only letters: "\p{L}+"
Letters and numbers: "[\p{L}\p{N}]+"
Starts with a letter but can contain numbers: "\p{L}[\p{L}\p{N}]+"
The default stopwords None does not remove stopwords
User handle pattern: r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
Retweet pattern: r"\bRT\b"
URL pattern: r"http(s)?:\/\/[\w\.\/\?\=]+"
@@ -60,7 +62,7 @@ def __init__(self,
        else:
            self.language = language

        # Handle pattern from NLTK
        self.HANDLE_RE = r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
        self.URL_RE = r"http(s)?:\/\/[\w\.\/\?\=]+"
        self.RT_RE = r"\bRT\b"
@@ -70,6 +72,10 @@ def __init__(self,
        self.TOKEN_RE = regex.compile(token_pattern)
        self.remove_hashtags = remove_hashtags
        self.lowercase = lowercase
        if stopwords is not None:
            self.stopwords = set(stopwords)
        else:
            self.stopwords = None
        return

    def tokenize(self, tweet):
@@ -79,11 +85,22 @@ def tokenize(self, tweet):
"""
if self.remove_hashtags:
tweet = self.HASHTAG_RE.sub(" ", tweet)

# Remove URLs, handles, "RT"
tweet = self.REMOVAL_RE.sub(" ", tweet)
tweet = self.WHITESPACE_RE.sub(" ", tweet)

# Lowercase
if self.lowercase:
tweet = tweet.lower()
return self.TOKEN_RE.findall(tweet)

# Tokenize
tokens = self.TOKEN_RE.findall(tweet)

# Remove stopwords
if self.stopwords:
tokens = [t for t in tokens if t not in self.stopwords]
return tokens


    def tokenize_tweet_file(self, input_file, sample_size=-1, return_tokens=False):
        """
