'''
Subject: Topics of Tweets of US Congress through the Lens of Campaign Finance
Date: 09/12/2018
Author: Edmund D. Chitwood

Summary:
This script:
- reads updated preprocessed Tweets into a pandas DataFrame,
- vectorizes the word counts,
- performs Latent Dirichlet Allocation (LDA) to derive topics from and
  assign topics to the Tweets, and
- stores the topics and links to the Tweets in a DataFrame and pickles it.
'''
from __future__ import print_function

import logging

import pandas as pd
# gensim supplies the LDA implementation and the sparse-matrix conversion
from gensim import matutils, models
from sklearn.feature_extraction.text import CountVectorizer

# Logging for gensim (set to INFO)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# Load Data
tweets = pd.read_pickle("updated_preprocessed_tweets.pkl")
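# Assumption inferred from usage below: the pickle provides at least a
# 'clean_text' column (preprocessed Tweet text) and a 'permalink' column
# (link to the original Tweet).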
# Create the document-term matrix with CountVectorizer
cv = CountVectorizer()
matrix = cv.fit_transform(tweets.clean_text)
# Note: on scikit-learn >= 1.2, get_feature_names() was removed in favor of
# get_feature_names_out()
columns = cv.get_feature_names()
# Sparse2Corpus treats columns as documents by default, so transpose the
# (documents x terms) matrix to (terms x documents) before converting
matrix = matrix.transpose()
corpus = matutils.Sparse2Corpus(matrix)
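# Optional sanity check (a minimal sketch): each corpus document is a list of
# (term_id, count) pairs, so the first entry should mirror the first Tweet's
# word counts.
print(next(iter(corpus))[:10])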
# Map each column index to its word so gensim can label topics; deriving the
# mapping from the fitted vocabulary avoids hard-coding its size
id2word = dict(enumerate(columns))
# Create Latent Dirichlet Allocation (LDA) model (same as "fit" in sklearn)
lda = (models.LdaModel(corpus=corpus, num_topics=50, id2word=id2word,
passes=20,eval_every=None,))
# Preserve model for further analysis
lda.save('lda.model')
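# Optional eyeball check of model quality (a sketch using gensim's standard
# show_topics API): print the top words for a handful of topics.
for topic_id, words in lda.show_topics(num_topics=5, num_words=8,
                                       formatted=True):
    print(topic_id, ':', words)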
# Transform the docs from the word space to the topic space
# (like "transform" in sklearn)
lda_corpus = lda[corpus]
# Store the documents' topic vectors in a list
lda_docs = list(lda_corpus)
# Create a DataFrame holding the {topic_id: weight} dict for each Tweet
topics = [dict(doc) for doc in lda_docs]
topics_df = pd.DataFrame(topics)
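# Note: gensim omits topics below its minimum_probability threshold, so
# topics_df holds NaN for unreported topics. If a dense topic matrix is wanted
# downstream (an assumption about later use), fill the gaps with zeros:
# topics_df = topics_df.fillna(0)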
# Combine topics DataFrame with links to actual Tweets
tweet_topics = pd.DataFrame()
tweet_topics['tweets'] = tweets.permalink
# Reset to a default RangeIndex so rows align positionally with topics_df
tweet_topics.reset_index(inplace=True)
tweets_links_topics = pd.concat([tweet_topics, topics_df], axis=1, sort=False)
# Preserve Tweets to topics mapping
tweets_links_topics.to_pickle('tweets_links_topics.pkl')
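# Optional round-trip check (a minimal sketch): reload the pickle and confirm
# one row per Tweet survived the concat.
check = pd.read_pickle('tweets_links_topics.pkl')
assert len(check) == len(tweets), 'row count should match number of Tweets'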