Fundamentally change tokenization and tokens() constructor methods
See Item 10 in NEWS.md
kbenoit committed Jan 20, 2020
1 parent 079fe99 commit 5fd84e7
Showing 19 changed files with 650 additions and 636 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -1,5 +1,5 @@
Package: quanteda
Version: 1.9.9009
Version: 1.9.9010
Title: Quantitative Analysis of Textual Data
Description: A fast, flexible, and comprehensive framework for
quantitative text analysis in R. Provides functionality for corpus management,
@@ -51,6 +51,7 @@ Imports:
spacyr,
stopwords,
stringi,
tokenizers,
xml2,
yaml,
proxyC (>= 0.1.4)
@@ -167,6 +168,7 @@ Collate:
'textstat_readability.R'
'textstat_simil.R'
'textstat_simil_old.R'
'tokenize.R'
'tokens-methods-base.R'
'tokens.R'
'tokens_chunk.R'
19 changes: 19 additions & 0 deletions NAMESPACE
@@ -364,9 +364,13 @@ S3method(textstat_simil,default)
S3method(textstat_simil,dfm)
S3method(textstat_simil_old,default)
S3method(textstat_simil_old,dfm)
S3method(tokenize,character)
S3method(tokenize,corpus)
S3method(tokenize,default)
S3method(tokens,character)
S3method(tokens,corpus)
S3method(tokens,default)
S3method(tokens,list)
S3method(tokens,tokens)
S3method(tokens_chunk,tokens)
S3method(tokens_compound,default)
@@ -529,6 +533,7 @@ export(textstat_readability)
export(textstat_select)
export(textstat_simil)
export(textstat_simil_old)
export(tokenize)
export(tokens)
export(tokens_chunk)
export(tokens_compound)
@@ -620,9 +625,23 @@ importFrom(stats,resid)
importFrom(stats,residuals)
importFrom(stats,rstandard)
importFrom(stats,sd)
importFrom(stringi,stri_detect_charclass)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_length)
importFrom(stringi,stri_locate_first_fixed)
importFrom(stringi,stri_locate_last_fixed)
importFrom(stringi,stri_replace_all_fixed)
importFrom(stringi,stri_split_boundaries)
importFrom(stringi,stri_split_fixed)
importFrom(stringi,stri_split_regex)
importFrom(stringi,stri_startswith_fixed)
importFrom(stringi,stri_subset_regex)
importFrom(stringi,stri_trans_nfc)
importFrom(stringi,stri_trans_tolower)
importFrom(stringi,stri_trans_toupper)
importFrom(stringi,stri_trim_both)
importFrom(tokenizers,tokenize_characters)
importFrom(tokenizers,tokenize_sentences)
importFrom(utils,combn)
importFrom(utils,glob2rx)
useDynLib(quanteda, .registration = TRUE)
16 changes: 15 additions & 1 deletion NEWS.md
@@ -32,8 +32,22 @@

8. All included data objects are upgraded to the new formats. This includes the three corpus objects and the single dfm data object.

9. New print methods with new global options. Similar to the extended printing options for dfm objects, printing of corpus objects now allows for brief summaries of the texts to be printed, and for the number of documents and the length of the previews to be controlled by new global options.
9. New print methods for core objects (corpus, tokens, dfm, dictionary) now exist, each with new global options to control the number of documents shown, as well as how much of each document is displayed: the length of a text snippet (corpus), the number of tokens (tokens), of dfm cells (dfm), or of keys and values (dictionary).

10. **quanteda** v2 implements major changes to the `tokens()` constructor. These are designed to simplify the code and its maintenance in **quanteda**, to allow users to work with other (external) tokenizers, and to improve consistency across the tokens processing options. Changes include:

- A new method `tokens.list(x, ...)` constructs a `tokens` object from a named list of characters, allowing users to tokenize texts using some other function (or package) such as `tokenize_words()`, `tokenize_sentences()`, or `tokenize_tweets()` from the **tokenizers** package, or the list returned by `spacyr::spacy_tokenize()`.

- All `remove_*` options to `tokens()` now remove these elements from tokens objects by calling `tokens.tokens()` after constructing the object. "Pre-processing" is now actually post-processing, applied internally via `tokens_*()` methods after a simple tokenization on word boundaries. This improves both performance and consistency in handling special characters (e.g. Twitter characters) across different tokenizer engines. (#1503, #1446, #1801)

- To maintain consistency with current behaviour, a new **quanteda** function named `tokenize()` now provides functionality similar to the pre-v2 `what = "word"` option. The `what` argument has been removed from the `tokens()` signature but still works, although its use is deprecated.

- The option `remove_twitter` has been replaced in `tokenize()` by `preserve_tags`, which preserves valid social media hashtags and usernames (using Twitter rules for validity) rather than removing the `#` and `@` punctuation characters if `remove_punct = TRUE`.

- The option `remove_separators` is removed and deprecated.

- The option `remove_hyphens` is removed and deprecated, replaced by `split_infix_hyphens`, whose default of `FALSE` preserves infix (internal) hyphens rather than splitting on them.

## Bug fixes and stability enhancements

* docnames now enforced to be character (formerly, could be numeric for some objects).
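For illustration, a minimal sketch of the external-tokenizer workflow described in item 10 above, assuming only the documented behaviour of the new `tokens.list()` method: tokenize with the **tokenizers** package, construct the tokens object, then post-process with `tokens_*()` functions.

library("quanteda")
library("tokenizers")

txt <- c(doc1 = "A first document, with punctuation.",
         doc2 = "A second document.")

# tokenize externally, keeping case and punctuation
toks_external <- tokenize_words(txt, lowercase = FALSE, strip_punct = FALSE)

# construct a tokens object from the named list, then post-process
toks <- tokens(toks_external)
toks <- tokens_remove(toks, pattern = "^\\p{P}+$", valuetype = "regex")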
15 changes: 8 additions & 7 deletions R/corpus_segment.R
@@ -92,7 +92,8 @@
#' pattern_position = "after", extract_pattern = FALSE)
#' cbind(texts(corpseg3), docvars(corpseg3))
#'
#' @import stringi
#' @importFrom stringi stri_trim_both stri_replace_all_fixed stri_locate_last_fixed
#' stri_locate_first_fixed
#' @export
corpus_segment <- function(x, pattern = "##*",
valuetype = c("glob", "regex", "fixed"),
@@ -203,8 +204,8 @@ segment_texts <- function(x, pattern = NULL, valuetype = "regex",
omit_empty = TRUE, what = "other", ...){

# normalize EOL
x <- stri_replace_all_fixed(x, "\r\n", "\n") # Windows
x <- stri_replace_all_fixed(x, "\r", "\n") # Old Macintosh
x <- stringi::stri_replace_all_fixed(x, "\r\n", "\n") # Windows
x <- stringi::stri_replace_all_fixed(x, "\r", "\n") # Old Macintosh

# use preset regex pattern
if (what == 'paragraphs') {
@@ -219,17 +220,17 @@

if (valuetype == "glob") {
# treat as fixed if no glob character is detected
if (!any(stri_detect_charclass(pattern, c("[*?]")))) {
if (!any(stringi::stri_detect_charclass(pattern, c("[*?]")))) {
valuetype <- "fixed"
} else {
pattern <- escape_regex(pattern)
pattern <- stri_replace_all_fixed(pattern, '*', '(\\S*)')
pattern <- stri_replace_all_fixed(pattern, '?', '(\\S)')
pattern <- stringi::stri_replace_all_fixed(pattern, '*', '(\\S*)')
pattern <- stringi::stri_replace_all_fixed(pattern, '?', '(\\S)')
valuetype <- "regex"
}
}

x <- stri_trim_both(x)
x <- stringi::stri_trim_both(x)
if (valuetype == "fixed") {
if (pattern_position == "after") {
x <- stri_replace_all_fixed(x, pattern, stri_c(pattern, "\uE000"),
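A small sketch of the glob-to-regex conversion used in `segment_texts()` above, omitting the internal `escape_regex()` step that escapes regex metacharacters first:

library("stringi")

pat <- "chap*"
# replicate the wildcard substitution from segment_texts()
pat <- stri_replace_all_fixed(pat, "*", "(\\S*)")
pat <- stri_replace_all_fixed(pat, "?", "(\\S)")
pat                                  # "chap(\\S*)"
stri_detect_regex("chapter", pat)    # TRUE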
24 changes: 12 additions & 12 deletions R/nfunctions.R
Expand Up @@ -187,12 +187,11 @@ ntype.tokens <- function(x, ...) {
#'
#' Return the count of sentences in a corpus or character object.
#' @param x a character or [corpus] whose sentences will be counted
#' @param ... additional arguments passed to [tokens()]
#' @note `nsentence()` relies on the boundaries definitions in the
#' \pkg{stringi} package (see [stri_opts_brkiter][stringi::stri_opts_brkiter]). It does not
#' @note `nsentence()` relies on the boundaries definitions in the \pkg{stringi}
#' package (see [stri_opts_brkiter][stringi::stri_opts_brkiter]). It does not
#' count sentences correctly if the text has been transformed to lower case,
#' and for this reason `nsentence()` will issue a warning if it detects
#' all lower-cased text.
#' and for this reason `nsentence()` will issue a warning if it detects all
#' lower-cased text.
#' @return count(s) of the total sentences per text
#' @examples
#' # simple example
@@ -201,34 +200,35 @@ ntype.tokens <- function(x, ...) {
#' text3 = "Mr. Jones has a PhD from the LSE. Second sentence.")
#' nsentence(txt)
#' @export
nsentence <- function(x, ...) {
nsentence <- function(x) {
UseMethod("nsentence")
}

#' @export
nsentence.default <- function(x, ...) {
nsentence.default <- function(x) {
stop(friendly_class_undefined_message(class(x), "nsentence"))
}

#' @export
nsentence.character <- function(x, ...) {
#' @importFrom tokenizers tokenize_sentences
nsentence.character <- function(x) {
upcase <-
try(any(stringi::stri_detect_charclass(x, "[A-Z]")), silent = TRUE)
if (!is.logical(upcase)) {
# warning("Input text contains non-UTF-8 characters.")
} else if (!upcase)
warning("nsentence() does not correctly count sentences in all lower-cased text")
lengths(tokens(x, what = "sentence", ...))
lengths(tokenizers::tokenize_sentences(x))
}

#' @export
nsentence.corpus <- function(x, ...) {
nsentence.corpus <- function(x) {
x <- as.corpus(x)
nsentence(texts(x), ...)
nsentence(texts(x))
}

#' @export
nsentence.tokens <- function(x, ...) {
nsentence.tokens <- function(x) {
x <- as.tokens(x)
if (attr(x, "what") != "sentence")
stop("nsentence on a tokens object only works if what = \"sentence\"")
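As context for the change above, a short sketch of what `nsentence.character()` now delegates to: `tokenizers::tokenize_sentences()` returns a list of sentence vectors, and `lengths()` gives the per-document counts.

library("tokenizers")

txt <- c(text1 = "This is a sentence: second part of first sentence.",
         text2 = "A word. Repeated repeated.")
tokenize_sentences(txt)
lengths(tokenize_sentences(txt))   # one sentence in text1, two in text2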
2 changes: 1 addition & 1 deletion R/nscrabble.R
Expand Up @@ -38,7 +38,7 @@ nscrabble.character <- function(x, FUN = sum) {
values = as.integer(rep(c(1, 3, 3, 2, 1, 4, 2, 4, 1, 8, 5, 1, 3, 1, 1, 3, 10, 1, 1, 1, 1, 4, 4, 8, 4, 10), 2)))
setkey(letterVals, letter)

textChars <- as.list(tokens(x, what = "character", remove_punct = TRUE))
textChars <- tokenizers::tokenize_characters(x, lowercase = TRUE, strip_non_alphanum = TRUE)
textDT <- data.table(docIndex = rep(seq_along(textChars), lengths(textChars)),
Char = unlist(textChars, use.names = FALSE))
setkey(textDT, Char)
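For reference, a quick look at what the replacement `tokenizers::tokenize_characters()` call returns for the letter-scoring step, with the same arguments as in the diff above:

library("tokenizers")

tokenize_characters("Quixotry!", lowercase = TRUE, strip_non_alphanum = TRUE)
## [[1]]
## [1] "q" "u" "i" "x" "o" "t" "r" "y"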
4 changes: 2 additions & 2 deletions R/quanteda_options.R
Expand Up @@ -141,8 +141,8 @@ get_options_default <- function(){
print_corpus_max_ndoc = 0L,
print_corpus_max_nchar = 60L,
print_corpus_summary = TRUE,
print_tokens_max_ndoc = 0L,
print_tokens_max_ntoken = 20L,
print_tokens_max_ndoc = 6L,
print_tokens_max_ntoken = 10L,
print_tokens_summary = TRUE,
base_docname = "text",
base_featname = "feat",
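These new print defaults can be overridden per session with `quanteda_options()`; a small sketch using the option names shown above:

library("quanteda")

# show at most 3 documents and 8 tokens per document when printing tokens objects
quanteda_options(print_tokens_max_ndoc = 3L, print_tokens_max_ntoken = 8L)
quanteda_options("print_tokens_max_ndoc")   # 3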
2 changes: 1 addition & 1 deletion R/textstat_lexdiv.R
Expand Up @@ -224,7 +224,7 @@ textstat_lexdiv.tokens <-
tokens_only_measures <- c("MATTR", "MSTTR")

# additional token handling
x <- tokens(x, remove_hyphens = remove_hyphens,
x <- tokens(x, split_infix_hyphens = remove_hyphens,
remove_numbers = remove_numbers,
remove_symbols = remove_symbols)

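A short sketch of the renamed hyphen option, using the new `tokenize()` function added in this commit (defined in R/tokenize.R below); the document name and text are illustrative only:

library("quanteda")

txt <- c(d1 = "A state-of-the-art tokenizer.")
# the default preserves infix (internal) hyphens
tokenize(txt, split_infix_hyphens = FALSE)
# splitting on internal hyphens instead
tokenize(txt, split_infix_hyphens = TRUE)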
168 changes: 168 additions & 0 deletions R/tokenize.R
@@ -0,0 +1,168 @@
#' quanteda default tokenizer
#'
#' Default \pkg{quanteda} tokenizer implementing the full set of options, for a
#' character or [corpus] input.
#' @param x a character or [corpus] object
#' @inheritParams tokens
#' @param ... not used
#' @return a (uniquely) named list of characters
#' @keywords tokens
#' @importFrom stringi stri_split_regex stri_detect_regex stri_detect_charclass
#' stri_split_boundaries stri_subset_regex stri_split_fixed
#' @export
#' @examples
#' txt <- c(doc1 = "Tweet https://quanteda.io using @quantedainit and #rstats.",
#' doc2 = "The £1,000,000 question.",
#' doc3 = "毎日#quantedaを使用してください!")
#' tokenize(txt)
#' tokenize(txt, remove_symbols = TRUE, remove_punct = TRUE)
#' tokenize(txt, remove_symbols = TRUE, remove_punct = TRUE, remove_url = TRUE)
#' tokenize(txt, remove_symbols = TRUE, remove_punct = TRUE, preserve_tags = TRUE)
#' tokenize(txt, remove_symbols = FALSE, remove_punct = TRUE, remove_numbers = TRUE)
tokenize <- function(x,
remove_punct = FALSE,
remove_symbols = FALSE,
remove_numbers = FALSE,
remove_url = FALSE,
preserve_tags = !remove_punct,
split_infix_hyphens = FALSE, ...) {
UseMethod("tokenize")
}

#' @rdname tokenize
#' @noRd
#' @export
tokenize.default <- function(x, ...) {
stop(friendly_class_undefined_message(class(x), "tokenize"))
}

#' @rdname tokenize
#' @noRd
#' @export
tokenize.corpus <- function(x, ...) {
tokenize(texts(x), ...)
}

#' @rdname tokenize
#' @noRd
#' @export
tokenize.character <- function(x,
remove_punct = FALSE,
remove_symbols = FALSE,
remove_numbers = FALSE,
remove_url = FALSE,
preserve_tags = !remove_punct,
split_infix_hyphens = FALSE,
...) {

check_dots(list(...), names(formals("tokenize")))
check_input(x)
named <- names(x)

# initial split on white space
out <- stri_split_regex(x, "[\\p{Z}\\p{C}]+")

# get document indexes to vectorize tokens
doc_lengths <- cumsum(lengths(out))
docindex <- c(0, doc_lengths)
# convert the list into a vector - avoids all those mapplys
out <- unlist(out)

# initialize index vectors
index_symbols <- index_tag <- index_url <- index_infixhyphens <-
rep(FALSE, length(out))
# initialize regex for removals
regex_to_remove <- character()

if (!split_infix_hyphens) {
# anything with an interior hyphen
index_infixhyphens <- stri_detect_regex(out, ".+\\p{Pd}.+")
}

if (preserve_tags) {
# get the index of social media #hashtags and @usernames
index_tag <- stri_detect_regex(out, "^#[A-Za-z]+\\w*|^@\\w+")
}

if (!remove_symbols) {
# anything with a symbol not also a tag
index_symbols <- stri_detect_charclass(out, "\\p{S}") & !index_tag
}

# handle URLs and email addresses
index_url <- stri_detect_regex(out, "^((https{0,1}|s{0,1}ftp)://)|(\\w+@\\w+)")
if (remove_url) {
out[index_url] <- ""
regex_to_remove <- c(regex_to_remove, "^$") # remove the blanks later
}

# apply ICU word tokenizer to everything not indexed for preservation
index_to_split <- !(index_url | index_tag | index_symbols | index_infixhyphens)
out[index_to_split] <- stri_split_boundaries(out[index_to_split], type = "word")

# convert the vector back to a list
out <- split(out,
cut(
seq_along(out),
docindex,
include.lowest = FALSE,
labels = named
))
# handle nested lists from 2nd stage stri_split_boundaries() call
out <- lapply(out, unlist)

# final removals
if (remove_punct)
regex_to_remove <- c(regex_to_remove, "^\\p{P}$")
if (remove_symbols)
regex_to_remove <- c(regex_to_remove, "^\\p{S}$")
if (remove_numbers)
regex_to_remove <- c(regex_to_remove, "^\\p{Sc}{0,1}(\\p{N}+[,.]{0,1}\\p{N}+)+\\p{Sc}{0,1}$")
if (length(regex_to_remove) > 0)
out <- lapply(out, function(toks)
stri_subset_regex(toks, paste(regex_to_remove, collapse = "|"), negate = TRUE))

# reattach names
names(out) <- named

out
}

# internal tokenizers for legacy handling of what argument ----------

#' @rdname tokenize
#' @importFrom tokenizers tokenize_characters
tokenize_character <- function(x) {
tokenizers::tokenize_characters(x, strip_non_alphanum = FALSE, simplify = FALSE)
}

#' @rdname tokenize
#' @importFrom tokenizers tokenize_sentences
tokenize_sentence <- function(x) {
tokenizers::tokenize_sentences(x)
}

#' @rdname tokenize
tokenize_fasterword <- function(x) {
stringi::stri_split_regex(x, "[\\p{Z}\\p{C}]+")
}

tokenize_fastestword <- function(x) {
stringi::stri_split_fixed(x, " ")
}


# utility functions ----------

check_input <- function(x) {
check_character <- is.character(x)
if (is.list(x)) {
check_list <- all(vapply(x, is.character, logical(1))) &
all(vapply(x, length, integer(1)) == 1L)
} else {
check_list <- FALSE
}
if (!(check_character | check_list))
stop("Input must be a character vector of any length or a list of character\n",
" vectors, each of which has a length of 1.")
}
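The core of `tokenize.character()` above is a two-pass design: a coarse split on whitespace, an index of tokens to preserve intact (tags, URLs, infix hyphens), and an ICU word-boundary split for everything else. A minimal standalone sketch of that idea using **stringi** only, with illustrative variable names rather than the package internals:

library("stringi")

txt <- "Keep #rstats and state-of-the-art intact, split the rest."
toks <- unlist(stri_split_regex(txt, "[\\p{Z}\\p{C}]+"))

# index the tokens to protect: social media tags and words with interior hyphens
protect <- stri_detect_regex(toks, "^#[A-Za-z]+\\w*|^@\\w+") |
    stri_detect_regex(toks, ".+\\p{Pd}.+")

# apply the ICU word tokenizer only to the unprotected tokens
out <- as.list(toks)
out[!protect] <- stri_split_boundaries(toks[!protect], type = "word")
unlist(out)
# "Keep" "#rstats" "and" "state-of-the-art" "intact" "," "split" "the" "rest" "."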