## The tm library and related plugins comprise R's most popular text-mining stack.
## See http://cran.r-project.org/web/packages/tm/vignettes/tm.pdf
library(tm)
library(tidyverse)
library(slam)
library(proxy)
## tm has many "reader" functions. Each one has
## arguments elem, language, id
## (see ?readPlain, ?readPDF, ?readXML, etc)
## This wraps another function around readPlain to read
## plain text documents in English.
# I've stored this function as a Github "gist" at:
# https://gist.github.com/jgscott/28d9d1287a0c3c1477e2113f6758d5ff
readerPlain = function(fname){
  readPlain(elem=list(content=readLines(fname)),
            id=fname, language='en')
}
## Test it on Adam Smith
adam = readerPlain("../data/division_of_labor.txt")
adam
meta(adam)
content(adam)
## apply to all of Simon Cowell's articles
## (probably not THE Simon Cowell)
## "globbing" = expanding wild cards in filename paths
file_list = Sys.glob('../data/ReutersC50/C50train/SimonCowell/*.txt')
simon = lapply(file_list, readerPlain)
# The file names are ugly...
file_list
# Clean up the file names
mynames = file_list %>%
  { strsplit(., '/', fixed=TRUE) } %>%
  { lapply(., tail, n=2) } %>%
  { lapply(., paste0, collapse='') } %>%
  unlist
# Inspect the cleaned names, then rename the articles
mynames
names(simon) = mynames
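# An equivalent sketch using base R: basename() and dirname() recover the
# same last two path components (author directory + file name).
mynames_alt = paste0(basename(dirname(file_list)), basename(file_list))
all(mynames_alt == mynames)  # should be TRUE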
## Once you have documents in a vector, you can
## create a text-mining 'corpus' with:
documents_raw = Corpus(VectorSource(simon))
## Some pre-processing/tokenization steps.
## tm_map just maps some function to every document in the corpus
my_documents = documents_raw
my_documents = tm_map(my_documents, content_transformer(tolower)) # make everything lowercase
my_documents = tm_map(my_documents, content_transformer(removeNumbers)) # remove numbers
my_documents = tm_map(my_documents, content_transformer(removePunctuation)) # remove punctuation
my_documents = tm_map(my_documents, content_transformer(stripWhitespace)) # remove excess whitespace
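# A quick sanity check (a sketch): look at one cleaned document
# to confirm the transformations did what we expect.
content(my_documents[[1]])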
## Remove stopwords. Always be careful with this!
stopwords("en")
stopwords("SMART")
?stopwords
my_documents = tm_map(my_documents, content_transformer(removeWords), stopwords("en"))
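# A sketch: a custom word list can be removed the same way. The words
# below are arbitrary examples (not from the original analysis); this
# writes to a new object so the pipeline above is unchanged.
my_stopwords = c("said", "will", "also")
docs_custom = tm_map(my_documents, content_transformer(removeWords), my_stopwords)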
## create a doc-term-matrix
DTM_simon = DocumentTermMatrix(my_documents)
DTM_simon # some basic summary statistics
class(DTM_simon) # a special kind of sparse matrix format
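# A sketch using the slam library loaded above: compute summary statistics
# on the sparse matrix directly, without converting it to a dense matrix.
term_counts = slam::col_sums(DTM_simon)
head(sort(term_counts, decreasing=TRUE), 10)  # ten most frequent terms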
## You can inspect its entries...
inspect(DTM_simon[1:10,1:20])
## ...find words that occur at least a minimum number of times...
findFreqTerms(DTM_simon, 50)
## ...or find words whose count correlates with a specified word.
findAssocs(DTM_simon, "genetic", .5)
## Drop those terms that only occur in one or two documents
## This is a common step: the noise of the "long tail" (rare terms)
## can be huge, and there is nothing to learn if a term occurred once.
## Below removes those terms that have count 0 in >95% of docs.
## Probably a bit extreme in most cases... but here only 50 docs!
DTM_simon = removeSparseTerms(DTM_simon, 0.95)
DTM_simon # now ~ 1000 terms (versus ~3000 before)
# construct TF-IDF weights
tfidf_simon = weightTfIdf(DTM_simon)
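# A sketch of what weightTfIdf computes with its defaults (normalize=TRUE):
# within-document term frequency times log2(nDocs / document frequency).
# Assumes every document still has at least one retained term.
counts = as.matrix(DTM_simon)
tf = counts / rowSums(counts)
idf = log2(nrow(counts) / colSums(counts > 0))
tfidf_manual = sweep(tf, 2, idf, `*`)
max(abs(tfidf_manual - as.matrix(tfidf_simon)))  # should be ~0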
####
# Compare documents
####
inspect(tfidf_simon[1,])
# could go back to the raw corpus
content(simon[[1]])
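# A sketch using the proxy library loaded above: cosine similarity between
# document 1 and every document in TF-IDF space. The top match is document 1
# itself (similarity 1); the next few are its nearest neighbors.
X_tfidf = as.matrix(tfidf_simon)
cos_sim = proxy::simil(X_tfidf[1, , drop=FALSE], X_tfidf, method="cosine")
order(as.vector(cos_sim), decreasing=TRUE)[1:5]  # indices of most similar docs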
####
# Dimensionality reduction
####
# Now PCA on the TF-IDF weights
X = as.matrix(tfidf_simon)
summary(colSums(X))
# scale=TRUE will fail on constant (all-zero) columns, so drop them first
scrub_cols = which(colSums(X) == 0)
X = X[,-scrub_cols]
pca_simon = prcomp(X, rank.=2, scale=TRUE)
plot(pca_simon)
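# A sketch: share of total variance captured by the first two components,
# computed from the full vector of standard deviations in pca_simon$sdev.
var_explained = pca_simon$sdev^2 / sum(pca_simon$sdev^2)
round(var_explained[1:2], 3)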
# Look at the loadings
pca_simon$rotation[order(abs(pca_simon$rotation[,1]), decreasing=TRUE), 1][1:25]
pca_simon$rotation[order(abs(pca_simon$rotation[,2]), decreasing=TRUE), 2][1:25]
## Look at the first two PCs...
# We've now turned each document into a single pair of numbers -- massive dimensionality reduction
pca_simon$x[,1:2]
plot(pca_simon$x[,1:2], xlab="PCA 1 direction", ylab="PCA 2 direction",
     bty="n", type='n')
text(pca_simon$x[,1:2], labels = 1:length(simon), cex=0.7)
# Both about "Scottish Amicable"
content(simon[[46]])
content(simon[[48]])
# Both about genetic testing
content(simon[[25]])
content(simon[[26]])
# Both about Ladbroke's merger
content(simon[[10]])
content(simon[[11]])
# Conclusion: even just these two-number summaries still preserve a lot of information
#####
# Cluster documents
#####
# Define the distance matrix using the PCA scores
dist_mat = dist(pca_simon$x)
tree_simon = hclust(dist_mat)
plot(tree_simon)
clust5 = cutree(tree_simon, k=5)
# inspect the clusters
which(clust5 == 3)
content(simon[[18]])
content(simon[[19]])
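# A sketch: cluster sizes and membership at a glance.
table(clust5)
split(seq_along(clust5), clust5)  # document indices in each cluster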