The guts of a pmi function done

cokelly · Apr 11, 2017 · 2126a07 · 2126a07
1 parent 50305f8
commit 2126a07
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 40 deletions.
diff --git a/R/CollDBclass.R b/R/CollDBclass.R
@@ -8,5 +8,6 @@ collDB <- setClass("collDB",
                              right_locs = "list", 
                              node = "character", 
                              node_hash = "character", 
+                             node_recurrence = "numeric",
                              doc_table = "tbl_df"), 
                    contains = "list")
diff --git a/R/count_collocates.R b/R/count_collocates.R
@@ -0,0 +1,31 @@
+#' A simple function for counting collocates
+#' 
+#' @param collsDB A collocates list, combined with relevant counts, generated by save_collocates
+#' 
+count_collocates <- function(collsDB){
+      # Convert duplicates, locations below 1 and above the word count to NA
+left_locs <- collsDB[[1]] %>% unlist(.)
+right_locs <- collsDB[[1]] %>% unlist(.)
+all_locs <- c(left_locs, right_locs)
+# Convert duplicates, locations below 1 and above the word count to NA
+all_locs[duplicated(all_locs)] <- NA
+all_locs[all_locs < 1] <- NA
+all_locs[all_locs > nrow(collsDB$doc_table)] <- NA
+
+# Convert locations into words
+coll_counts <- collsDB$doc_table[all_locs,] %>%
+      table(.) %>%
+      tibble(word = names(.), coll_count = .)
+
+return(coll_counts)
+}
+
+# Leaving this here for offering locational collocate data
+#left_counts <- doc$doc_table[left_locs, ] %>%
+#     table(.) %>% 
+#    tibble(word = names(.), count = .)  
+
+#right_counts <- doc$doc_table[right_locs, ] %>%
+#     table(.) %>% 
+#    tibble(word = names(.), count = .)
+#}
diff --git a/R/pmi.R b/R/pmi.R
@@ -7,32 +7,41 @@
 #' @param remove_stops OPTIONAL
 #' 
 #' 
-pmi <- function(document, window, node, floor, remove_stops = TRUE){
+pmi <- function(document, floor = 3, window, node, remove_stops = TRUE){
       # Test to see if the document is a collDB class
       # That is, that it has already gone through save_collocates
       if(class(document) != "collDB"){
             doc <- save_collocates(document = document, 
                                    window = window, 
                                    node = node, 
                                    remove_stops = remove_stops)
-      }
-      # Convert duplicates, locations below 1 and above the word count to NA
-      left_locs <- doc[[1]] %>% unlist(.)
-      left_locs[duplicated(left_locs)] <- NA
-      left_locs[left_locs < 1] <- NA
-      left_locs[left_locs > nrow(doc$doc_table)] <- NA
-
-      all_locs <- c(left_locs, right_locs)
-      # Convert locations into words
-      all_counts <- doc$doc_table[all_locs,] %>%
+      } else {doc <- document}
+
+      # Count the collocates
+
+      coll_counts <- count_collocates(doc)
+
+      coll_counts <- filter(coll_counts, coll_count > floor)
+
+      # Count all relevant words in the whole document
+      # Note, for efficiency sake I only count the words that are in the collocate window
+      all_words_counts <- filter(doc$doc_table, word %in% coll_counts$word) %>% 
             table(.) %>%
-            tibble(word = names(.), count = .)
-      # Leaving this here for offering locational collocate data
-      #left_counts <- doc$doc_table[left_locs, ] %>%
-       #     table(.) %>% 
-        #    tibble(word = names(.), count = .)  
-
-      #right_counts <- doc$doc_table[right_locs, ] %>%
-       #     table(.) %>% 
-        #    tibble(word = names(.), count = .)
+            tibble(word = names(.), all_count = .)
+
+      counts <- left_join(coll_counts, all_words_counts, by = "word")
+
+      # Calculate the PMI
+      # Calculate pmi
+
+      pmi <- tibble(phrase = coll_counts$word, 
+            collocate_freqs = coll_counts$coll_count, 
+            doc_freqs = all_words_counts$all_count,
+            probx = doc$node_recurrence/nrow(doc$doc_table),
+            proby = doc_freqs/nrow(doc$doc_table),
+            probxy = collocate_freqs/nrow(doc$doc_table),
+            pmi = log(probxy/(probx*proby)),
+            npmi = pmi/-log(probxy))
+
+      return(pmi)
 }
diff --git a/R/save_collocates.R b/R/save_collocates.R
@@ -10,16 +10,23 @@
 #' @importFrom digest sha1
 #' @keywords collocates kwic
 
-save_collocates <- function(document, window, node, remove_stops = TRUE){
+save_collocates <- function(document, window, node, remove_stops = TRUE, remove_numerals = TRUE){
       node_length <- length(unlist(strsplit(node, " ")))
       # Test to see if the node phrase is larger than the window
       if(node_length > ((window*2)+1)){ # longer than twice the window plus the keyword
             stop("Error: the node phrase is longer than the kwic window")
       }
+      # Remove numerals
+      if(remove_numerals == TRUE){
+            document <- str_replace_all(document, "[0-9]", "")      
+      }
+
       document <- tolower(document)
-      # Hash the node to to create a single phrase (and ensure stopwords aren't removed)
+
+      # Hash the node to to create a single phrase (and ensure stopwords contained in the 
+      # node aren't removed)
       node1 <- sha1(node)
-      document <- str_replace_all(document, node, node1)
+      document <- gsub(x = document, pattern = paste("\\b", node, "\\b", sep = ""), replacement = node1)
       # Unnest
       word.t <- tibble(document) %>% 
             unnest_tokens(word, 
@@ -29,31 +36,44 @@ save_collocates <- function(document, window, node, remove_stops = TRUE){
       # If required remove stopwords
       if(remove_stops == TRUE){
             data("stop_words")
-            x <- which(!(word.t$word %in% stop_words))
-            word.t <- word.t[x,]
+            word.t <- filter(word.t, !(word %in% stop_words$word))
+            #x <- which(!(word.t$word %in% stop_words$word))
+            #word.t <- word.t[x,]
       }
       # Get locations of node
       node_loc <- which(word.t == node1)
       # If there are no matches, just return a vector of NAs
       if(length(node_loc) == 0){
-            collocate_locs <- list(rep(NA, times=(window)), rep(NA, times=(window)), node, node1, word.t)
+            collocate_locs <- list(rep(NA, times=(window)), 
+                                   rep(NA, times=(window)), 
+                                   node, 
+                                   node1, 
+                                   word.t)
 
       } else {
-            left_locs <- lapply(node_loc, function(x) ((x-window):(x-1)))
-            right_locs <- lapply(node_loc, function(x) ((x+1):(x+window)))
-            # No need for a tibble here: a list of vectors would be more efficient. 
-            # A tibble would only be required if we were going to be displaying the locs, ubt we're not
-            #collocate_locs <- lapply(seq_along(1:length(left_locs)), function(x) as_tibble(t(min(left_locs[[x]]):max(right_locs[[x]]))))
-            #collocate_locs <- bind_rows(collocate_locs)
-            collocate_locs <- list(left_locs, right_locs, node, node1, word.t)
+            left_locs <- lapply(node_loc, function(x) 
+                  ((x-window):(x-1)))
+            right_locs <- lapply(node_loc, 
+                                 function(x) 
+                                       ((x+1):(x+window)))
+            # convert any node_locs into NA
+            left_locs <- lapply(left_locs, 
+                        function(x) 
+                              unlist(sapply(x,
+                                            function(a) 
+                                                  ifelse(a %in% node_loc, yes = a <- NA, no = a <- a))))
+
+            right_locs <- lapply(right_locs, 
+                        function(x) 
+                              unlist(sapply(x,
+                                          function(a) 
+                                                ifelse(a %in% node_loc, yes = a <- NA, no = a <- a))))
+
+            collocate_locs <- list(left_locs, right_locs, node, node1, length(node_loc), word.t)
       }
-      names(collocate_locs) <- c("left_locs", "right_locs", "node", "node_hash", "doc_table")
-      #left_cols <- paste("L", seq_along(window:1), sep="")
-      #right_cols <- paste("R", seq_along(1:window), sep="")
-      #name_cols <- c(left_cols, "node", right_cols)
-      #colnames(collocate_locs) <- name_cols
-
-      as(object = collocate_locs, Class = "collDB")
+      names(collocate_locs) <- c("left_locs", "right_locs", "node", "node_hash", "node_recurrence", "doc_table")
+
+      collocate_locs <- as(object = collocate_locs, Class = "collDB")
 
       return(collocate_locs)     
 }