Skip to content

Commit

Permalink
added gna_verifier to replace gnr_resolve
Browse files Browse the repository at this point in the history
Relates to  #940
  • Loading branch information
Zachary Foster committed Dec 13, 2024
1 parent c4cc16a commit 4f51b46
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ export(getkey)
export(gisd_isinvasive)
export(gna_parse)
export(gna_search)
export(gna_verifier)
export(gni_details)
export(gni_parse)
export(gni_seach)
Expand Down
191 changes: 191 additions & 0 deletions R/gna_verifier.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#' Verify a list of scientific names against biodiversity data-sources.
#'
#' This service parses incoming names, executes exact or fuzzy matching as
#' required, and returns the best-scored result. Optionally, it can also return
#' matches from data-sources selected by a user.
#'
#' @export
#'
#' @param names A `character` vector of taxon names to verify.
#' @param data_sources A `character` or `integer` vector with numbers
#' corresponding to data sources. See [gna_data_sources()] for a list of
#' available options.
#' @param all_matches When `TRUE`, return all found matches, not only the best
#' one. Multiple results are returned in results. These results are sorted by
#' matching quality, the first result is the same as bestResult.
#' @param capitalize When `TRUE`, capitalize the first letter of a name-string.
#' @param species_group When `TRUE`, expands the search to species group where
#' applicable.
#' @param fuzzy_uninomial When `TRUE`, allows fuzzy matching for uninomial
#' names.
#' @param stats When `TRUE`, finds out a kingdom and a taxon (main taxon) that
#' contain most names. It only takes in account the names matched to the
#' Catalogue of Life entries. This option is ignored, if the Catalogue of Life
#' is not included in data-sources.
#' @param main_taxon_threshold A `numeric` vector from 0.5 to 1. This sets the
#' minimal percentage for the main taxon discovery.
#' @param output_type A `character` vector of length 1, one of `table`, `list`,
#' `json`, indicating the format of the output. The tabular output only
#' contains values that consistently appear in all results, so `list` or
#' `json` output can have additional information. For `list` and `json`
#' outputs, only values for unique taxon names are returned, but the `table`
#' output has rows that correspond 1-1 with the input data.
#' @param ... Curl options passed on to [crul::HttpClient]
#'
#' @return Depends on the value of the `output_type` option
#'
#' @author Zachary S.L. Foster
#'
#' @examples \dontrun{
#' gna_verifier(c("Helianthus annuus", "Homo saapiens"))
#' gna_verifier(c("Helianthus annuus", "Homo saapiens"), all_matches = TRUE)
#' }
gna_verifier <- function(
names,
data_sources = c(1, 12),
all_matches = FALSE,
capitalize = FALSE,
species_group = FALSE,
fuzzy_uninomial = FALSE,
stats = FALSE,
main_taxon_threshold = 0.5,
output_type = 'table',
...
) {
# Parse and verify input options
data_sources <- as.character(data_sources)
is_number <- grepl(data_sources, pattern = '[0-9]+')
if (any(! is_number) || length(data_sources) == 0) {
stop(call. = FALSE, 'The `data_sources` input must be a vector of numbers with at least one value.')
}
data_sources <- paste0(data_sources, collapse = '|')

check_if_logical <- function(value, name) {
if (length(value) != 1 || is.na(value) || ! is.logical(value)) {
stop(call. = FALSE, 'The `', name, '` input must be a TRUE/FALSE vector of length 1.')
}
}
check_if_logical(all_matches, 'all_matches')
check_if_logical(capitalize, 'capitalize')
check_if_logical(species_group, 'species_group')
check_if_logical(fuzzy_uninomial, 'fuzzy_uninomial')
check_if_logical(stats, 'stats')

if (length(main_taxon_threshold) != 1 || ! is.numeric(main_taxon_threshold) || main_taxon_threshold < 0.5 || main_taxon_threshold > 1) {
stop(call. = FALSE, 'The `main_taxon_threshold` input must be a single number between 0.5 to 1')
}

output_type <- match.arg(output_type, c('table', 'list', 'json'))

# Convert input to unique values to avoid redundant API wprk
unique_names <- unique(names)

# Format the API GET request
base_url <- 'https://verifier.globalnames.org/'
args <- c(
data_sources = paste0(data_sources, collapse = '|'),
all_matches = tolower(as.character(all_matches)),
capitalize = tolower(as.character(capitalize)),
species_group = tolower(as.character(species_group)),
fuzzy_uninomial = tolower(as.character(fuzzy_uninomial)),
stats = tolower(as.character(stats))
)
formatted_args <- paste0(paste0(names(args), '=', args), collapse = '&')
formatted_path <- paste0(
'api/v1/verifications/',
paste0(unique_names, collapse = '|'),
'?', formatted_args
)

# Make and parse API call
api <- crul::HttpClient$new(base_url, headers = tx_ual, opts = list(...))
response <- api$get(path = formatted_path)
response$raise_for_status()
response_json <- response$parse("UTF-8")
if (output_type == 'json') {
return(response_json)
}
response_data <- jsonlite::fromJSON(response_json, FALSE)
if (output_type == 'list') {
return(response_data)
}

# Reformat response data to a table
used_cols <- c(
'submittedName',
'dataSourceId',
'dataSourceTitleShort',
'curation',
'recordId',
'outlink',
'entryDate',
'sortScore',
'matchedNameID',
'matchedName',
'matchedCardinality',
'matchedCanonicalSimple',
'matchedCanonicalFull',
'currentRecordId',
'currentNameId',
'currentName',
'currentCardinality',
'currentCanonicalSimple',
'currentCanonicalFull',
'taxonomicStatus',
'isSynonym',
'editDistance',
'stemEditDistance',
'matchType',
'cardinalityScore',
'infraSpecificRankScore',
'fuzzyLessScore',
'curatedDataScore',
'authorMatchScore',
'acceptedNameScore',
'parsingQualityScore'
)
convert_entry_to_row <- function(x, input_name) {
output <- c(input_name, unlist(x))
names(output)[1] <- 'submittedName'
parsed_names <- vapply(strsplit(names(output), split = '\\.'),
function(y) y[length(y)], FUN.VALUE = character(1))
names(output) <- parsed_names
return(as.data.frame(as.list(output[used_cols])))
}
if (all_matches) {
response_table <- do.call(rbind, lapply(response_data$names, function(x) {
do.call(rbind, lapply(x$results, function(y) convert_entry_to_row(y, x$name)))
}))
} else {
response_table <- do.call(rbind, lapply(response_data$names, function(x) {
convert_entry_to_row(x$bestResult, x$name)
}))
}

# Reduplicate rows to match input
split_table <- split(response_table, response_table$submittedName)
submitted_names <- vapply(split_table, function(x) x$submittedName[1], FUN.VALUE = character(1))
response_table <- do.call(rbind, split_table[match(names, submitted_names)])
row.names(response_table) <- NULL

# Format table to be more useful to the user
numeric_cols <- c(
'sortScore',
'matchedCardinality',
'currentCardinality',
'editDistance',
'stemEditDistance',
'cardinalityScore',
'infraSpecificRankScore',
'fuzzyLessScore',
'curatedDataScore',
'authorMatchScore',
'acceptedNameScore',
'parsingQualityScore'
)
response_table[numeric_cols] <- lapply(response_table[numeric_cols], as.numeric)
response_table$isSynonym <- as.logical(response_table$isSynonym)
response_table <- tibble::as_tibble(response_table)

return(response_table)
}
69 changes: 69 additions & 0 deletions man/gna_verifier.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 4f51b46

Please sign in to comment.