From 4f51b468f4aed3bc4e535178cca6b909955f0349 Mon Sep 17 00:00:00 2001 From: Zachary Foster Date: Thu, 12 Dec 2024 17:31:48 -0800 Subject: [PATCH] added gna_verifier to replace gnr_resolve Relates to #940 --- NAMESPACE | 1 + R/gna_verifier.R | 191 ++++++++++++++++++++++++++++++++++++++++++++ man/gna_verifier.Rd | 69 ++++++++++++++++ 3 files changed, 261 insertions(+) create mode 100644 R/gna_verifier.R create mode 100644 man/gna_verifier.Rd diff --git a/NAMESPACE b/NAMESPACE index 9b0c52ac..03aea40b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -245,6 +245,7 @@ export(getkey) export(gisd_isinvasive) export(gna_parse) export(gna_search) +export(gna_verifier) export(gni_details) export(gni_parse) export(gni_seach) diff --git a/R/gna_verifier.R b/R/gna_verifier.R new file mode 100644 index 00000000..f45fc602 --- /dev/null +++ b/R/gna_verifier.R @@ -0,0 +1,191 @@ +#' Verify a list of scientific names against biodiversity data-sources. +#' +#' This service parses incoming names, executes exact or fuzzy matching as +#' required, and returns the best-scored result. Optionally, it can also return +#' matches from data-sources selected by a user. +#' +#' @export +#' +#' @param names A `character` vector of taxon names to verify. +#' @param data_sources A `character` or `integer` vector with numbers +#' corresponding to data sources. See [gna_data_sources()] for a list of +#' available options. +#' @param all_matches When `TRUE`, return all found matches, not only the best +#' one. Multiple results are returned in results. These results are sorted by +#' matching quality, the first result is the same as bestResult. +#' @param capitalize When `TRUE`, capitalize the first letter of a name-string. +#' @param species_group When `TRUE`, expands the search to species group where +#' applicable. +#' @param fuzzy_uninomial When `TRUE`, allows fuzzy matching for uninomial +#' names. +#' @param stats When `TRUE`, finds out a kingdom and a taxon (main taxon) that +#' contain most names. It only takes in account the names matched to the +#' Catalogue of Life entries. This option is ignored, if the Catalogue of Life +#' is not included in data-sources. +#' @param main_taxon_threshold A `numeric` vector from 0.5 to 1. This sets the +#' minimal percentage for the main taxon discovery. +#' @param output_type A `character` vector of length 1, one of `table`, `list`, +#' `json`, indicating the format of the output. The tabular output only +#' contains values that consistently appear in all results, so `list` or +#' `json` output can have additional information. For `list` and `json` +#' outputs, only values for unique taxon names are returned, but the `table` +#' output has rows that correspond 1-1 with the input data. +#' @param ... Curl options passed on to [crul::HttpClient] +#' +#' @return Depends on the value of the `output_type` option +#' +#' @author Zachary S.L. Foster +#' +#' @examples \dontrun{ +#' gna_verifier(c("Helianthus annuus", "Homo saapiens")) +#' gna_verifier(c("Helianthus annuus", "Homo saapiens"), all_matches = TRUE) +#' } +gna_verifier <- function( + names, + data_sources = c(1, 12), + all_matches = FALSE, + capitalize = FALSE, + species_group = FALSE, + fuzzy_uninomial = FALSE, + stats = FALSE, + main_taxon_threshold = 0.5, + output_type = 'table', + ... +) { + # Parse and verify input options + data_sources <- as.character(data_sources) + is_number <- grepl(data_sources, pattern = '[0-9]+') + if (any(! is_number) || length(data_sources) == 0) { + stop(call. = FALSE, 'The `data_sources` input must be a vector of numbers with at least one value.') + } + data_sources <- paste0(data_sources, collapse = '|') + + check_if_logical <- function(value, name) { + if (length(value) != 1 || is.na(value) || ! is.logical(value)) { + stop(call. = FALSE, 'The `', name, '` input must be a TRUE/FALSE vector of length 1.') + } + } + check_if_logical(all_matches, 'all_matches') + check_if_logical(capitalize, 'capitalize') + check_if_logical(species_group, 'species_group') + check_if_logical(fuzzy_uninomial, 'fuzzy_uninomial') + check_if_logical(stats, 'stats') + + if (length(main_taxon_threshold) != 1 || ! is.numeric(main_taxon_threshold) || main_taxon_threshold < 0.5 || main_taxon_threshold > 1) { + stop(call. = FALSE, 'The `main_taxon_threshold` input must be a single number between 0.5 to 1') + } + + output_type <- match.arg(output_type, c('table', 'list', 'json')) + + # Convert input to unique values to avoid redundant API wprk + unique_names <- unique(names) + + # Format the API GET request + base_url <- 'https://verifier.globalnames.org/' + args <- c( + data_sources = paste0(data_sources, collapse = '|'), + all_matches = tolower(as.character(all_matches)), + capitalize = tolower(as.character(capitalize)), + species_group = tolower(as.character(species_group)), + fuzzy_uninomial = tolower(as.character(fuzzy_uninomial)), + stats = tolower(as.character(stats)) + ) + formatted_args <- paste0(paste0(names(args), '=', args), collapse = '&') + formatted_path <- paste0( + 'api/v1/verifications/', + paste0(unique_names, collapse = '|'), + '?', formatted_args + ) + + # Make and parse API call + api <- crul::HttpClient$new(base_url, headers = tx_ual, opts = list(...)) + response <- api$get(path = formatted_path) + response$raise_for_status() + response_json <- response$parse("UTF-8") + if (output_type == 'json') { + return(response_json) + } + response_data <- jsonlite::fromJSON(response_json, FALSE) + if (output_type == 'list') { + return(response_data) + } + + # Reformat response data to a table + used_cols <- c( + 'submittedName', + 'dataSourceId', + 'dataSourceTitleShort', + 'curation', + 'recordId', + 'outlink', + 'entryDate', + 'sortScore', + 'matchedNameID', + 'matchedName', + 'matchedCardinality', + 'matchedCanonicalSimple', + 'matchedCanonicalFull', + 'currentRecordId', + 'currentNameId', + 'currentName', + 'currentCardinality', + 'currentCanonicalSimple', + 'currentCanonicalFull', + 'taxonomicStatus', + 'isSynonym', + 'editDistance', + 'stemEditDistance', + 'matchType', + 'cardinalityScore', + 'infraSpecificRankScore', + 'fuzzyLessScore', + 'curatedDataScore', + 'authorMatchScore', + 'acceptedNameScore', + 'parsingQualityScore' + ) + convert_entry_to_row <- function(x, input_name) { + output <- c(input_name, unlist(x)) + names(output)[1] <- 'submittedName' + parsed_names <- vapply(strsplit(names(output), split = '\\.'), + function(y) y[length(y)], FUN.VALUE = character(1)) + names(output) <- parsed_names + return(as.data.frame(as.list(output[used_cols]))) + } + if (all_matches) { + response_table <- do.call(rbind, lapply(response_data$names, function(x) { + do.call(rbind, lapply(x$results, function(y) convert_entry_to_row(y, x$name))) + })) + } else { + response_table <- do.call(rbind, lapply(response_data$names, function(x) { + convert_entry_to_row(x$bestResult, x$name) + })) + } + + # Reduplicate rows to match input + split_table <- split(response_table, response_table$submittedName) + submitted_names <- vapply(split_table, function(x) x$submittedName[1], FUN.VALUE = character(1)) + response_table <- do.call(rbind, split_table[match(names, submitted_names)]) + row.names(response_table) <- NULL + + # Format table to be more useful to the user + numeric_cols <- c( + 'sortScore', + 'matchedCardinality', + 'currentCardinality', + 'editDistance', + 'stemEditDistance', + 'cardinalityScore', + 'infraSpecificRankScore', + 'fuzzyLessScore', + 'curatedDataScore', + 'authorMatchScore', + 'acceptedNameScore', + 'parsingQualityScore' + ) + response_table[numeric_cols] <- lapply(response_table[numeric_cols], as.numeric) + response_table$isSynonym <- as.logical(response_table$isSynonym) + response_table <- tibble::as_tibble(response_table) + + return(response_table) +} \ No newline at end of file diff --git a/man/gna_verifier.Rd b/man/gna_verifier.Rd new file mode 100644 index 00000000..0fb275e0 --- /dev/null +++ b/man/gna_verifier.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gna_verifier.R +\name{gna_verifier} +\alias{gna_verifier} +\title{Verify a list of scientific names against biodiversity data-sources.} +\usage{ +gna_verifier( + names, + data_sources = c(1, 12), + all_matches = FALSE, + capitalize = FALSE, + species_group = FALSE, + fuzzy_uninomial = FALSE, + stats = FALSE, + main_taxon_threshold = 0.5, + output_type = "table", + ... +) +} +\arguments{ +\item{names}{A \code{character} vector of taxon names to verify.} + +\item{data_sources}{A \code{character} or \code{integer} vector with numbers +corresponding to data sources. See \code{\link[=gna_data_sources]{gna_data_sources()}} for a list of +available options.} + +\item{all_matches}{When \code{TRUE}, return all found matches, not only the best +one. Multiple results are returned in results. These results are sorted by +matching quality, the first result is the same as bestResult.} + +\item{capitalize}{When \code{TRUE}, capitalize the first letter of a name-string.} + +\item{species_group}{When \code{TRUE}, expands the search to species group where +applicable.} + +\item{fuzzy_uninomial}{When \code{TRUE}, allows fuzzy matching for uninomial +names.} + +\item{stats}{When \code{TRUE}, finds out a kingdom and a taxon (main taxon) that +contain most names. It only takes in account the names matched to the +Catalogue of Life entries. This option is ignored, if the Catalogue of Life +is not included in data-sources.} + +\item{main_taxon_threshold}{A \code{numeric} vector from 0.5 to 1. This sets the +minimal percentage for the main taxon discovery.} + +\item{output_type}{A \code{character} vector of length 1, one of \code{table}, \code{list}, +\code{json}, indicating the format of the output. The tabular output only +contains values that consistently appear in all results, so \code{list} or +\code{json} output can have additional information. For \code{list} and \code{json} +outputs, only values for unique taxon names are returned, but the \code{table} +output has rows that correspond 1-1 with the input data.} + +\item{...}{Curl options passed on to \link[crul:HttpClient]{crul::HttpClient}} +} +\description{ +This service parses incoming names, executes exact or fuzzy matching as +required, and returns the best-scored result. Optionally, it can also return +matches from data-sources selected by a user. +} +\examples{ +\dontrun{ +gna_verifier(c("Helianthus annuus", "Homo saapiens")) +gna_verifier(c("Helianthus annuus", "Homo saapiens"), all_matches = TRUE) +} +} +\author{ +Zachary S.L. Foster +}