Text Analysis in R via Julia.
Being a wrapper around a Julia package (TextAnalysis.jl), textanalysis requires a working Julia installation.
# install.packages("remotes")
remotes::install_github("news-r/textanalysis") # github
You must run init_textanalysis() at the beginning of every session; otherwise you will encounter errors and be prompted to do so.
library(textanalysis) # load the package
init_textanalysis() # initialise
#> Julia version 1.1.1 at location /home/jp/Downloads/julia-1.1.1-linux-x86_64/julia-1.1.1/bin will be used.
#> Loading setup script for JuliaCall...
#> Finish loading setup script for JuliaCall.
#> ✔ textanalysis initialised.
Some functions depend on the development version of the Julia package; to install it, run:
install_textanalysis(version = "latest")
# build document
str <- paste(
  "They <span>write</span>, it writes too!!!",
  "This is another sentence.",
  "More stuff in this document."
)
doc <- string_document(str)
# basic cleanup
prepare(doc)
#> ⚠ This function changes `document` in place!
get_text(doc)
#> [1] " write writes sentence stuff document"
# stem
stem_words(doc)
get_text(doc)
#> [1] "write write sentenc stuff document"
# corpus
doc2 <- token_document("Hey write another document.")
# combine
corpus <- corpus(doc, doc2)
# standardize
standardize(corpus, "token_document")
#> ⚠ This function changes `corpus` in place!
# prepare corpus
prepare(corpus, strip_html_tags = FALSE)
#> ⚠ This function changes `corpus` in place!
get_text(corpus)
#> # A tibble: 2 x 2
#> text document
#> <chr> <int>
#> 1 write write sentenc stuff document 1
#> 2 "hey write document " 2
# lexicon + lexical stats
(lexicon <- lexicon(corpus))
#> # A tibble: 6 x 2
#> words n
#> <chr> <int>
#> 1 stuff 1
#> 2 document 2
#> 3 write 3
#> 4 hey 1
#> 5 "" 2
#> 6 sentenc 1
lexical_frequency(corpus, "document")
#> [1] 0.2
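This is simply the term's count over the corpus' total token count, which can be re-derived from the lexicon above:
# "document" occurs 2 times out of 10 tokens
with(lexicon, n[words == "document"] / sum(n))
#> [1] 0.2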
# inverse index (one entry per lexicon word)
inverse_index(corpus)
#> [1] 6
# document-term matrix (dtm)
m <- document_term_matrix(corpus)
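The dtm can be pulled into R as a plain matrix with dtm_matrix() (used again for clustering below); a quick sketch:
# materialise the document-term matrix as a base R matrix
dtm_matrix(m, "dense")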
# helper: bind the lexicon words to matrix output
bind_lexicon <- function(data){
  data %>%
    as.data.frame() %>%
    dplyr::bind_cols(
      lexicon %>%
        dplyr::select(-n),
      .
    )
}
# term-frequency
tf(m) %>% bind_lexicon()
#> # A tibble: 6 x 3
#> words V1 V2
#> <chr> <dbl> <dbl>
#> 1 stuff 0 0.4
#> 2 document 0.2 0.2
#> 3 write 0 0.2
#> 4 hey 0.2 0
#> 5 "" 0.2 0
#> 6 sentenc 0.4 0.2
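Term frequency is each term's count divided by its document's token count, so (assuming tf() returns a base matrix, as bind_lexicon()'s as.data.frame() call suggests) each document column sums to 1:
# both document columns are probability distributions over the lexicon
colSums(tf(m))
#> [1] 1 1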
# tf-idf
tf_idf(m) %>% bind_lexicon()
#> # A tibble: 6 x 3
#> words V1 V2
#> <chr> <dbl> <dbl>
#> 1 stuff 0 0.277
#> 2 document 0 0
#> 3 write 0 0.139
#> 4 hey 0.139 0
#> 5 "" 0.139 0
#> 6 sentenc 0 0
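The values follow the classic definition tf * log(n_docs / n_docs_with_term). For instance, a term with tf 0.4 present in only one of the two documents scores:
0.4 * log(2 / 1)
#> [1] 0.2772589
Terms present in both documents ("document", "sentenc") get log(2/2) = 0, which is why their rows are zero.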
# bm-25
# https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
bm_25(m) %>% bind_lexicon()
#> # A tibble: 6 x 3
#> words V1 V2
#> <chr> <dbl> <dbl>
#> 1 stuff 0 1.07
#> 2 document 0.834 0.834
#> 3 write 0 1.48
#> 4 hey 1.48 0
#> 5 "" 1.48 0
#> 6 sentenc 0.721 0.542
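BM25 is a tf-idf variant that saturates term frequency and normalises for document length (see the link above). A minimal sketch of the per-term score with the conventional Okapi defaults; the constants used by the Julia implementation may differ:
bm25_term <- function(tf, df, n_docs, doc_len, avg_dl, k1 = 2, b = 0.75) {
  idf <- log((n_docs - df + 0.5) / (df + 0.5) + 1) # document rarity
  idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_dl))
}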
# sentiment score for each document
sentiment(corpus)
#> [1] 0.5834179 0.5475631
# extractive summary: the ns most relevant sentences
summarize(string_document(str), ns = 2L)
#> [1] "They <span>write</span>, it writes too!!!"
#> [2] "This is another sentence."
Fit a Latent Dirichlet Allocation (LDA) model on the gensimr data.
set_seed(42L) # seed Julia's RNG for reproducibility
#> Julia Object of type MersenneTwister.
#> MersenneTwister(UInt32[0x0000002a], ...)
data("corpus", package = "gensimr")
documents <- to_documents(corpus) # convert vector to documents
crps <- corpus(documents)
dtm <- document_term_matrix(crps)
# 2 topics
# 1K iterations
lda_data <- lda(dtm, 2L, 1000L)
# per-document topic probabilities (rows sum to 1)
lda_data$ntopics_ndocs
#> [,1] [,2]
#> [1,] 1.0000000 0.0000000
#> [2,] 0.1000000 0.9000000
#> [3,] 0.6666667 0.3333333
#> [4,] 1.0000000 0.0000000
#> [5,] 0.0000000 1.0000000
#> [6,] 0.0000000 1.0000000
#> [7,] 0.0000000 1.0000000
#> [8,] 0.0000000 1.0000000
#> [9,] 0.0000000 1.0000000
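Each row is a document's distribution over the two topics; a hard assignment is the row-wise argmax:
# hard topic label per document
apply(lda_data$ntopics_ndocs, 1, which.max)
#> [1] 1 2 1 1 2 2 2 2 2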
# extract the dense matrix and cluster on tf-idf with k-means
mat <- dtm_matrix(dtm, "dense")
tfidf <- tf_idf(mat)
km <- kmeans(tfidf, centers = 2)
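km is an ordinary stats::kmeans fit, so the usual components are available:
km$cluster # cluster assignment for each row of the tf-idf matrix
km$centers # cluster centroids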
# feature hashing: map a string into one of 10 buckets
hash_func <- create_hash_function(10L)
hash("a", hash_func)
#> [1] 8
hash(doc) # documents have a built-in hash
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
#> [1,] 0 0 0 0 0 0 0 0 0 0 0 0 0
#> [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
#> [1,] 0 0 0 0 1 0 0 0 1 0 0
#> [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
#> [1,] 0 0 0 0 0 0 1 0 0 0 0
#> [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46]
#> [1,] 0 0 0 0 0 0 0 0 0 0 0
#> [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57]
#> [1,] 0 0 0 0 2 0 0 0 0 0 0
#> [,58] [,59] [,60] [,61] [,62] [,63] [,64] [,65] [,66] [,67] [,68]
#> [1,] 0 0 0 0 0 0 0 0 0 0 0
#> [,69] [,70] [,71] [,72] [,73] [,74] [,75] [,76] [,77] [,78] [,79]
#> [1,] 0 0 0 0 0 0 0 0 0 0 0
#> [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90]
#> [1,] 0 0 0 0 0 0 0 0 0 0 0
#> [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98] [,99] [,100]
#> [1,] 0 0 0 0 0 0 0 0 0 0
# naive Bayes classifier over two classes
classes <- factor(c("financial", "legal"))
model <- init_naive_classifer(classes)
train <- tibble::tibble(
  text = c("this is financial doc", "this is legal doc"),
  labels = factor(c("financial", "legal"))
)
train_naive_classifier(model, train, text, labels)
#> ⚠ This function changes `model` in place!
test <- tibble::tibble(
  text = "this should be predicted as a legal document"
)
predict_class(model, test, text)
#> # A tibble: 1 x 2
#> legal financial
#> <dbl> <dbl>
#> 1 0.667 0.333
The plot method uses echarts4r.
matrix <- coom(crps) # co-occurrence matrix
plot(matrix)