""" Periodically checks Twitter for tweets about arxiv papers we recognize and logs the tweets into mongodb database "arxiv", under "tweets" collection. """ import os import re import pytz import time import math import pickle import datetime from dateutil import parser import twitter # pip install python-twitter import pymongo from utils import Config # settings # ----------------------------------------------------------------------------- sleep_time = 60*10 # in seconds, between twitter API calls. Default rate limit is 180 per 15 minutes max_tweet_records = 15 # convenience functions # ----------------------------------------------------------------------------- def get_keys(): lines = open('twitter.txt', 'r').read().splitlines() return lines def extract_arxiv_pids(r): pids = [] for u in r.urls: m = re.search('arxiv.org/abs/(.+)', u.expanded_url) if m: rawid = m.group(1) pids.append(rawid) return pids def get_latest_or_loop(q): results = None while results is None: try: results = api.GetSearch(raw_query="q=%s&result_type=recent&count=100" % (q, )) except Exception as e: print('there was some problem (waiting some time and trying again):') print(e) time.sleep(sleep_time) return results epochd = datetime.datetime(1970,1,1,tzinfo=pytz.utc) # time of epoch def tprepro(tweet_text): # take tweet, return set of words t = tweet_text.lower() t = re.sub(r'[^\w\s]','',t) # remove punctuation ws = set([w for w in t.split() if not w.startswith('#')]) return ws # ----------------------------------------------------------------------------- # authenticate to twitter API keys = get_keys() api = twitter.Api(consumer_key=keys[0], consumer_secret=keys[1], access_token_key=keys[2], access_token_secret=keys[3]) # connect to mongodb instance client = pymongo.MongoClient() mdb = client.arxiv tweets = mdb.tweets # the "tweets" collection in "arxiv" database tweets_top1 = mdb.tweets_top1 tweets_top7 = mdb.tweets_top7 tweets_top30 = mdb.tweets_top30 print('mongodb tweets collection size:', tweets.count()) print('mongodb tweets_top1 collection size:', tweets_top1.count()) print('mongodb tweets_top7 collection size:', tweets_top7.count()) print('mongodb tweets_top30 collection size:', tweets_top30.count()) # load banned accounts banned = {} if os.path.isfile(Config.banned_path): with open(Config.banned_path, 'r') as f: lines = f.read().split('\n') for l in lines: if l: banned[l] = 1 # mark banned print('banning users:', list(banned.keys())) # main loop last_db_load = None while True: dnow_utc = datetime.datetime.now(datetime.timezone.utc) # fetch all database arxiv pids that we know about (and handle an upadte of the db file) if last_db_load is None or os.stat(Config.db_path).st_mtime > last_db_load: last_db_load = time.time() print('(re-) loading the paper database', Config.db_path) db = pickle.load(open(Config.db_path, 'rb')) # fetch the latest mentioning arxiv.org results = get_latest_or_loop('arxiv.org') to_insert = [] for r in results: arxiv_pids = extract_arxiv_pids(r) arxiv_pids = [p for p in arxiv_pids if p in db] # filter to those that are in our paper db if not arxiv_pids: continue # nothing we know about here, lets move on if tweets.find_one({'id':r.id}): continue # we already have this item if r.user.screen_name in banned: continue # banned user, very likely a bot # create the tweet. 
    # create the tweet. intentionally making it flat here without user nesting
    d = parser.parse(r.created_at) # datetime instance
    tweet = {}
    tweet['id'] = r.id
    tweet['pids'] = arxiv_pids # arxiv paper ids mentioned in this tweet
    tweet['inserted_at_date'] = dnow_utc
    tweet['created_at_date'] = d
    tweet['created_at_time'] = (d - epochd).total_seconds() # seconds since epoch
    tweet['lang'] = r.lang
    tweet['text'] = r.text
    tweet['user_screen_name'] = r.user.screen_name
    tweet['user_image_url'] = r.user.profile_image_url
    tweet['user_followers_count'] = r.user.followers_count
    tweet['user_following_count'] = r.user.friends_count
    to_insert.append(tweet)

  if to_insert: tweets.insert_many(to_insert)
  print('processed %d/%d new tweets. Currently maintaining total %d' % (len(to_insert), len(results), tweets.count()))

  # run over 1, 7, 30 day windows
  pid_to_words_cache = {}
  for days in [1,7,30]:
    tweets_top = {1:tweets_top1, 7:tweets_top7, 30:tweets_top30}[days]

    # precompute: compile together all votes over the last `days` days
    dminus = dnow_utc - datetime.timedelta(days=days)
    relevant = tweets.find({'created_at_date': {'$gt': dminus}})
    raw_votes, votes, records_dict = {}, {}, {}
    for tweet in relevant:
      # some tweets are really boring, like an RT
      tweet_words = tprepro(tweet['text'])
      isok = not(tweet['text'].startswith('RT') or tweet['lang'] != 'en' or len(tweet['text']) < 40)
      # give people with more followers more vote, as it's seen by more people and contributes to more hype.
      # e.g. 0 followers -> 0.0, ~1k followers -> ~1.5, capped at 2.0 from ~10k followers up
      float_vote = min(math.log10(tweet['user_followers_count'] + 1), 4.0)/2.0
      for pid in tweet['pids']:
        if not pid in records_dict:
          records_dict[pid] = {'pid':pid, 'tweets':[], 'vote': 0.0, 'raw_vote': 0} # create a new entry for this pid
        # good tweets make a comment, not just a boring RT, or exactly the post title. Detect these.
        if pid in pid_to_words_cache:
          title_words = pid_to_words_cache[pid]
        else:
          title_words = tprepro(db[pid]['title'])
          pid_to_words_cache[pid] = title_words
        comment_words = tweet_words - title_words # how much does the tweet have other than just the actual title of the article?
        isok2 = int(isok and len(comment_words) >= 3)
        # add up the votes for papers
        tweet_sort_bonus = 10000 if isok2 else 0 # let's bring meaningful comments up front
        records_dict[pid]['tweets'].append({'screen_name':tweet['user_screen_name'],
                                            'image_url':tweet['user_image_url'],
                                            'text':tweet['text'],
                                            'weight':float_vote + tweet_sort_bonus,
                                            'ok':isok2,
                                            'id':str(tweet['id'])})
        votes[pid] = votes.get(pid, 0.0) + float_vote
        raw_votes[pid] = raw_votes.get(pid, 0) + 1

    # record the total amount of vote/raw_vote for each pid
    for pid in votes:
      records_dict[pid]['vote'] = votes[pid] # total vote across relevant tweets
      records_dict[pid]['raw_vote'] = raw_votes[pid]

    # crop the tweets to only some number of highest weight ones (for efficiency)
    for pid, d in records_dict.items():
      d['num_tweets'] = len(d['tweets']) # back this up before we crop
      d['tweets'].sort(reverse=True, key=lambda x: x['weight'])
      if len(d['tweets']) > max_tweet_records:
        d['tweets'] = d['tweets'][:max_tweet_records]

    # some debugging information
    votes = [(v,k) for k,v in votes.items()]
    votes.sort(reverse=True, key=lambda x: x[0]) # sort descending by votes
    print('top votes:', votes[:min(len(votes), 10)])

    # write the results to mongodb
    if records_dict:
      tweets_top.delete_many({}) # clear the whole tweets_top collection
      tweets_top.insert_many(list(records_dict.values())) # insert all precomputed records (minimal tweets) with their votes

  # and sleep for a while
  print('sleeping', sleep_time)
  time.sleep(sleep_time)
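
# -----------------------------------------------------------------------------
# for reference, a minimal sketch of how a separate consumer process (e.g. the
# web server) might read the precomputed records back out, assuming the same
# local mongodb instance and the collection names used above. kept as a comment
# here because the loop above never returns:
#
#   import pymongo
#   client = pymongo.MongoClient()
#   top_today = client.arxiv.tweets_top1.find().sort('vote', pymongo.DESCENDING).limit(10)
#   for rec in top_today:
#     print(rec['pid'], rec['vote'], rec['raw_vote'], rec['num_tweets'])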