Skip to content

Commit

Permalink
added retweet/quoted skipping and TweetReader unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandra DeLucia committed Sep 1, 2020
1 parent e28c697 commit 92a6652
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
17 changes: 15 additions & 2 deletions littlebird/tweet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
# Standard imports
import logging
import os
import sys
import gzip
import zlib
Expand All @@ -25,12 +26,19 @@ class TweetReader:
def __init__(self, filename: str):
self.path = filename

# Check if file exists
if not os.path.exists(self.path):
logging.error(f"File {self.path} does not exist.")
sys.exit(1)

# Get filetype
try:
self.ftype = filetype.guess(filename).extension
except AttributeError as err:
# filetype could not be determined
self.ftype = None


# Open file
try:
if self.ftype == "gz":
self.f = gzip.open(filename, "r")
Expand All @@ -40,10 +48,15 @@ def __init__(self, filename: str):
logging.error(f"Issue opening {filename}:\n{err}")
sys.exit(1)

def read_tweets(self) -> Iterable[Any]:
def read_tweets(self,
skip_retweeted_and_quoted: bool = False
) -> Iterable[Any]:
try:
with jl.Reader(self.f) as reader:
for tweet in reader.iter(skip_empty=True, skip_invalid=True):
if skip_retweeted_and_quoted:
if "quoted_status" in tweet or "retweeted_status" in tweet:
continue
yield tweet
except (UnicodeDecodeError, gzip.BadGzipFile, zlib.error) as err:
logging.error(f"Error reading {self.path} of type {self.ftype}: {err}")
Expand Down
30 changes: 30 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""
Tests for littlebird/tweet_utils.py
Author: Alexandra DeLucia
"""
import unittest
from typing import List, Dict, Any

import littlebird
from littlebird import TweetReader
from littlebird import TweetWriter


class TestTweetReader(unittest.TestCase):
    """Unit tests for ``TweetReader`` construction and tweet filtering."""

    def test_file_not_found(self):
        """Constructing a reader on a missing path must raise ``SystemExit``."""
        with self.assertRaises(SystemExit):
            # Only the constructor's side effect matters here, so the
            # instance is intentionally not bound to a name.
            TweetReader("does_not_exist.json")

    def test_skip_retweets(self):
        """No retweets or quote tweets are yielded when skipping is enabled."""
        # NOTE(review): the fixture path is relative to the current working
        # directory, so this test presumably has to be run from the directory
        # containing demo_tweets.json -- confirm against the test runner setup.
        reader = TweetReader("demo_tweets.json")
        for tweet in reader.read_tweets(skip_retweeted_and_quoted=True):
            # Assert on key presence (not on the value's truthiness): a tweet
            # carrying either key at all should have been filtered out.
            self.assertNotIn("retweeted_status", tweet)
            self.assertNotIn("quoted_status", tweet)


# Run the test suite via unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main()


0 comments on commit 92a6652

Please sign in to comment.