Skip to content

Commit

Permalink
The guts of a pmi function done
Browse files Browse the repository at this point in the history
  • Loading branch information
cokelly committed Apr 11, 2017
1 parent 50305f8 commit 2126a07
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 40 deletions.
1 change: 1 addition & 0 deletions R/CollDBclass.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ collDB <- setClass("collDB",
right_locs = "list",
node = "character",
node_hash = "character",
node_recurrence = "numeric",
doc_table = "tbl_df"),
contains = "list")
31 changes: 31 additions & 0 deletions R/count_collocates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#' A simple function for counting collocates
#'
#' @param collsDB A collocates list, combined with relevant counts, generated by save_collocates
#'
count_collocates <- function(collsDB){
# Convert duplicates, locations below 1 and above the word count to NA
left_locs <- collsDB[[1]] %>% unlist(.)
right_locs <- collsDB[[1]] %>% unlist(.)
all_locs <- c(left_locs, right_locs)
# Convert duplicates, locations below 1 and above the word count to NA
all_locs[duplicated(all_locs)] <- NA
all_locs[all_locs < 1] <- NA
all_locs[all_locs > nrow(collsDB$doc_table)] <- NA

# Convert locations into words
coll_counts <- collsDB$doc_table[all_locs,] %>%
table(.) %>%
tibble(word = names(.), coll_count = .)

return(coll_counts)
}

# Leaving this here for offering locational collocate data
#left_counts <- doc$doc_table[left_locs, ] %>%
# table(.) %>%
# tibble(word = names(.), count = .)

#right_counts <- doc$doc_table[right_locs, ] %>%
# table(.) %>%
# tibble(word = names(.), count = .)
#}
49 changes: 29 additions & 20 deletions R/pmi.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,41 @@
#' @param remove_stops OPTIONAL
#'
#'
pmi <- function(document, window, node, floor, remove_stops = TRUE){
pmi <- function(document, floor = 3, window, node, remove_stops = TRUE){
# Test to see if the document is a collDB class
# That is, that it has already gone through save_collocates
if(class(document) != "collDB"){
doc <- save_collocates(document = document,
window = window,
node = node,
remove_stops = remove_stops)
}
# Convert duplicates, locations below 1 and above the word count to NA
left_locs <- doc[[1]] %>% unlist(.)
left_locs[duplicated(left_locs)] <- NA
left_locs[left_locs < 1] <- NA
left_locs[left_locs > nrow(doc$doc_table)] <- NA

all_locs <- c(left_locs, right_locs)
# Convert locations into words
all_counts <- doc$doc_table[all_locs,] %>%
} else {doc <- document}

# Count the collocates

coll_counts <- count_collocates(doc)

coll_counts <- filter(coll_counts, coll_count > floor)

# Count all relevant words in the whole document
# Note, for efficiency sake I only count the words that are in the collocate window
all_words_counts <- filter(doc$doc_table, word %in% coll_counts$word) %>%
table(.) %>%
tibble(word = names(.), count = .)
# Leaving this here for offering locational collocate data
#left_counts <- doc$doc_table[left_locs, ] %>%
# table(.) %>%
# tibble(word = names(.), count = .)

#right_counts <- doc$doc_table[right_locs, ] %>%
# table(.) %>%
# tibble(word = names(.), count = .)
tibble(word = names(.), all_count = .)

counts <- left_join(coll_counts, all_words_counts, by = "word")

# Calculate the PMI
# Calculate pmi

pmi <- tibble(phrase = coll_counts$word,
collocate_freqs = coll_counts$coll_count,
doc_freqs = all_words_counts$all_count,
probx = doc$node_recurrence/nrow(doc$doc_table),
proby = doc_freqs/nrow(doc$doc_table),
probxy = collocate_freqs/nrow(doc$doc_table),
pmi = log(probxy/(probx*proby)),
npmi = pmi/-log(probxy))

return(pmi)
}
60 changes: 40 additions & 20 deletions R/save_collocates.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,23 @@
#' @importFrom digest sha1
#' @keywords collocates kwic

save_collocates <- function(document, window, node, remove_stops = TRUE){
save_collocates <- function(document, window, node, remove_stops = TRUE, remove_numerals = TRUE){
node_length <- length(unlist(strsplit(node, " ")))
# Test to see if the node phrase is larger than the window
if(node_length > ((window*2)+1)){ # longer than twice the window plus the keyword
stop("Error: the node phrase is longer than the kwic window")
}
# Remove numerals
if(remove_numerals == TRUE){
document <- str_replace_all(document, "[0-9]", "")
}

document <- tolower(document)
# Hash the node to to create a single phrase (and ensure stopwords aren't removed)

# Hash the node to to create a single phrase (and ensure stopwords contained in the
# node aren't removed)
node1 <- sha1(node)
document <- str_replace_all(document, node, node1)
document <- gsub(x = document, pattern = paste("\\b", node, "\\b", sep = ""), replacement = node1)
# Unnest
word.t <- tibble(document) %>%
unnest_tokens(word,
Expand All @@ -29,31 +36,44 @@ save_collocates <- function(document, window, node, remove_stops = TRUE){
# If required remove stopwords
if(remove_stops == TRUE){
data("stop_words")
x <- which(!(word.t$word %in% stop_words))
word.t <- word.t[x,]
word.t <- filter(word.t, !(word %in% stop_words$word))
#x <- which(!(word.t$word %in% stop_words$word))
#word.t <- word.t[x,]
}
# Get locations of node
node_loc <- which(word.t == node1)
# If there are no matches, just return a vector of NAs
if(length(node_loc) == 0){
collocate_locs <- list(rep(NA, times=(window)), rep(NA, times=(window)), node, node1, word.t)
collocate_locs <- list(rep(NA, times=(window)),
rep(NA, times=(window)),
node,
node1,
word.t)

} else {
left_locs <- lapply(node_loc, function(x) ((x-window):(x-1)))
right_locs <- lapply(node_loc, function(x) ((x+1):(x+window)))
# No need for a tibble here: a list of vectors would be more efficient.
# A tibble would only be required if we were going to be displaying the locs, ubt we're not
#collocate_locs <- lapply(seq_along(1:length(left_locs)), function(x) as_tibble(t(min(left_locs[[x]]):max(right_locs[[x]]))))
#collocate_locs <- bind_rows(collocate_locs)
collocate_locs <- list(left_locs, right_locs, node, node1, word.t)
left_locs <- lapply(node_loc, function(x)
((x-window):(x-1)))
right_locs <- lapply(node_loc,
function(x)
((x+1):(x+window)))
# convert any node_locs into NA
left_locs <- lapply(left_locs,
function(x)
unlist(sapply(x,
function(a)
ifelse(a %in% node_loc, yes = a <- NA, no = a <- a))))

right_locs <- lapply(right_locs,
function(x)
unlist(sapply(x,
function(a)
ifelse(a %in% node_loc, yes = a <- NA, no = a <- a))))

collocate_locs <- list(left_locs, right_locs, node, node1, length(node_loc), word.t)
}
names(collocate_locs) <- c("left_locs", "right_locs", "node", "node_hash", "doc_table")
#left_cols <- paste("L", seq_along(window:1), sep="")
#right_cols <- paste("R", seq_along(1:window), sep="")
#name_cols <- c(left_cols, "node", right_cols)
#colnames(collocate_locs) <- name_cols

as(object = collocate_locs, Class = "collDB")
names(collocate_locs) <- c("left_locs", "right_locs", "node", "node_hash", "node_recurrence", "doc_table")

collocate_locs <- as(object = collocate_locs, Class = "collDB")

return(collocate_locs)
}

0 comments on commit 2126a07

Please sign in to comment.