Use data-raw folder for building datasets from scratch
brownag committed May 18, 2022
1 parent 837f69e commit 73add67
Showing 30 changed files with 3,257 additions and 217 deletions.
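
For context, this commit adopts the usethis "data-raw" convention: dataset build scripts live in data-raw/ (excluded from the built package) and write their finished objects to data/. A minimal sketch of that workflow in R, where the dataset name example_data is a hypothetical placeholder:

# one-time setup: creates data-raw/, adds ^data-raw$ to .Rbuildignore,
# and opens a template build script at data-raw/example_data.R
usethis::use_data_raw("example_data")

# at the end of data-raw/example_data.R: build the object, then write
# a compressed data/example_data.rda into the package
example_data <- data.frame(site = 1:3)
usethis::use_data(example_data, overwrite = TRUE, compress = "xz")
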
1 change: 1 addition & 0 deletions .Rbuildignore
Expand Up @@ -14,3 +14,4 @@ README.Rmd
^_pkgdown\.yml$
^pkgdown$
^CRAN-SUBMISSION$
+^data-raw$
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -19,3 +19,4 @@ URL: http://ncss-tech.github.io/soilDB/
BugReports: https://github.com/ncss-tech/soilDB/issues
RoxygenNote: 7.1.2
Roxygen: list(markdown = TRUE)
+LazyData: false
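
With LazyData: false, packaged datasets are no longer lazy-loaded, so code must load them explicitly, which is what the load(system.file(...)) calls in the diffs below do. An equivalent explicit load, shown here as a minimal sketch using a dataset touched by this commit:

# request a packaged dataset on demand (works with or without LazyData)
data("SCAN_SNOTEL_metadata", package = "soilDB")
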
2 changes: 1 addition & 1 deletion R/fetchSCAN.R
@@ -106,7 +106,7 @@ SCAN_site_metadata <- function(site.code = NULL) {
  SCAN_SNOTEL_metadata <- NULL

  # cached copy available in soilDB::SCAN_SNOTEL_metadata
-  load(system.file("data/SCAN_SNOTEL_metadata.rda", package="soilDB")[1])
+  load(system.file("data/SCAN_SNOTEL_metadata.rda", package = "soilDB")[1])

  if (is.null(site.code)) {
    idx <- 1:nrow(SCAN_SNOTEL_metadata)
4 changes: 2 additions & 2 deletions R/get_vegplot_data_from_NASIS_db.R
@@ -94,7 +94,7 @@ get_vegplot_location_from_NASIS_db <- function(SS = TRUE,
  state_FIPS_codes <- NULL
  # load FIPS codes from local package data
-  load(system.file("data/state_FIPS_codes.rda", package="soilDB"))
+  load(system.file("data/state_FIPS_codes.rda", package = "soilDB"))
  # add ESIS_id
  fips <- substr(d$site_id, 3, 5)
@@ -103,7 +103,7 @@ get_vegplot_location_from_NASIS_db <- function(SS = TRUE,
  fips_state_num <- state_FIPS_codes$state_fips[idx]
  year <- substr(d$site_id, 8, 9)
  sitenum <- substr(d$site_id, 10, 12)
-  d$ESIS_id <- paste(sitenum, year, fips_state_num, fips, sep='')
+  d$ESIS_id <- paste(sitenum, year, fips_state_num, fips, sep = '')
  # clean PLSS TRS data (remove literal periods; fixed = TRUE means the pattern is not a regex)
  d$plsstownship <- gsub(d$plsstownship, pattern = '.', replacement = '', fixed = TRUE)
55 changes: 55 additions & 0 deletions data-raw/NASIS_SoilProfileCollections.R
@@ -0,0 +1,55 @@
# make SPC data sets from NASIS
library(aqp)
library(soilDB)

# load current data sets to fetch peiid
data("loafercreek", package = "soilDB")
data("gopheridge", package = "soilDB")
data("mineralKing", package = "soilDB")

# # create CSVs (requires NASIS setup)
# # query CA630 and CA792 w/ R08 PEDON/SITE by SSA ID or similar
# # load source data sets (CA630 and CA792 pedons)
# nasis_pedons <- fetchNASIS(rmHzErrors = FALSE, SS = FALSE)
#
# p <- rebuildSPC(subset(nasis_pedons, siteiid %in% as.double(c(loafercreek, gopheridge, mineralKing)$siteiid)))
# write.csv(horizons(p), "data-raw/spc-horizons.csv", row.names = FALSE)
# write.csv(site(p), "data-raw/spc-site.csv", row.names = FALSE)
# write.csv(diagnostic_hz(p), "data-raw/spc-diagnostic_hz.csv", row.names = FALSE)
# write.csv(restrictions(p), "data-raw/spc-restrictions.csv", row.names = FALSE)

recent1822a <- read.csv("data-raw/spc-horizons.csv")
depths(recent1822a) <- peiid ~ hzdept + hzdepb
site(recent1822a) <- read.csv("data-raw/spc-site.csv")
diagnostic_hz(recent1822a) <- read.csv("data-raw/spc-diagnostic_hz.csv")
restrictions(recent1822a) <- read.csv("data-raw/spc-restrictions.csv")

# ensure that phiid is set as hzID
hzidname(recent1822a) <- "phiid"
hzdesgnname(recent1822a) <- "hzname"
hztexclname(recent1822a) <- "texcl"

# subset
loafercreek2 <- rebuildSPC(subset(recent1822a, profile_id(recent1822a) %in% profile_id(loafercreek)))
gopheridge2 <- rebuildSPC(subset(recent1822a, profile_id(recent1822a) %in% profile_id(gopheridge)))
mineralKing2 <- rebuildSPC(subset(recent1822a, profile_id(recent1822a) %in% profile_id(mineralKing)))

# ensure that phiid is set as hzID
hzidname(loafercreek2) <- "phiid"
hzidname(gopheridge2) <- "phiid"
hzidname(mineralKing2) <- "phiid"

# verify completeness
if (all(profile_id(loafercreek) %in% profile_id(loafercreek2)))
  loafercreek <- loafercreek2

if (all(profile_id(gopheridge) %in% profile_id(gopheridge2)))
  gopheridge <- gopheridge2

if (all(profile_id(mineralKing) %in% profile_id(mineralKing2)))
  mineralKing <- mineralKing2

# save to .rda
usethis::use_data(loafercreek, overwrite = TRUE, compress = 'xz')
usethis::use_data(gopheridge, overwrite = TRUE, compress = 'xz')
usethis::use_data(mineralKing, overwrite = TRUE, compress = 'xz')
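
Once rebuilt and written with usethis::use_data(), the three SPC datasets behave like any other packaged data. A minimal usage sketch:

library(aqp)

# explicit load (the package sets LazyData: false, per the DESCRIPTION change above)
data("loafercreek", package = "soilDB")

# quick visual check of a few rebuilt profiles
plotSPC(loafercreek[1:5, ])
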
8 changes: 3 additions & 5 deletions misc/make-metadata.R → data-raw/NASIS_metadata.R
@@ -1,7 +1,5 @@
library(soilDB)

-## code to prepare `metadata` dataset goes here
+# make data/metadata.rda (used by uncode() when NASIS not available)
-metadata <- soilDB:::.get_NASIS_metadata()
-save(metadata, file = "data/metadata.rda")
-
-head(metadata)
+metadata <- soilDB:::.get_NASIS_metadata()
+usethis::use_data(metadata, overwrite = TRUE, compress = 'xz')
@@ -24,7 +24,7 @@ table_fkeys <- sapply(table_colnames, function(x) {

table_pkeys <- sapply(table_colnames, function(x) {
  idx <- which(grepl("iid$", x) & !grepl("dbiid|tbl_", x))
-  if(length(idx) == 0) return(NA)
+  if (length(idx) == 0) return(NA)
  x[idx[length(idx)]]
})

@@ -117,8 +117,6 @@ NASIS_table_column_keys <- data.frame(table = table_names,
# })
# which(!cmpr_oldnu)

-save(NASIS_table_column_keys, file = "data/NASIS_table_column_keys.rda")
-
-View(NASIS_table_column_keys)
+usethis::use_data(NASIS_table_column_keys, overwrite = TRUE, compress = 'xz')


4 changes: 4 additions & 0 deletions data-raw/SCAN_SNOTEL_metadata.R
@@ -0,0 +1,4 @@
## code to prepare `SCAN_SNOTEL_metadata` dataset goes here
SCAN_SNOTEL_metadata <- read.csv("data-raw/station-metadata.csv")

usethis::use_data(SCAN_SNOTEL_metadata, overwrite = TRUE, compress = 'xz')
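
This cached table backs SCAN_site_metadata(), whose load() call is touched in R/fetchSCAN.R above. A short usage sketch; the two site codes are arbitrary examples, not values from this commit:

library(soilDB)

# filter the cached SCAN/SNOTEL station metadata by site code
m <- SCAN_site_metadata(site.code = c(356, 2072))
head(m)
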
9 changes: 9 additions & 0 deletions data-raw/SDV_themes.R
@@ -0,0 +1,9 @@
# library(soilDB)
# x <- SDA_query("SELECT DISTINCT attributekey FROM sdvattribute")
# res <- get_SDV_legend_elements(paste0("attributekey = ", x$attributekey))
# resall <- data.table::rbindlist(res, fill = TRUE)
# sdvmaplegend <- resall
#
# # including attributedescription makes the .rda 10x bigger
# sdvmaplegend$attributedescription <- NULL
# save(sdvmaplegend, file = "misc/sdvmaplegend.rda")
153 changes: 153 additions & 0 deletions data-raw/make-scan-snotel-metadata-db.R
@@ -0,0 +1,153 @@
# source data for this script and its output can be found in the scan-snotel-data folder.
# See SCAN_SNOTEL_metadata.R for the current build routine, which is abbreviated/derived from this one.

# library(soilDB)
# library(rvest)
# library(xml2)
# library(plyr)
#
# # https://github.com/ncss-tech/soilDB/issues/61
#
# # attempt to cross-reference a lab ID via pedon ID
# # using a LIMS report and HTML scraping
# # about 5 seconds per request
# getLabPedon <- function(pedonID) {
#   url <- sprintf('https://nasis.sc.egov.usda.gov/NasisReportsWebSite/limsreport.aspx?report_name=Pedon+Description+html+(userpedid)&pedon_id=%s', pedonID)
#
#   rpt <- read_html(url)
#   n <- html_node(rpt, xpath = "//*/table/tr[10]/td[1]/*")
#   n <- xml_text(n)
#   lab.id <- gsub(' ', '', strsplit(n, ':')[[1]][2])
#
#   return(lab.id)
# }
#
# getLabPedon <- Vectorize(getLabPedon)
#
#
#
# ##
# ## station list / site information
# ##
#
# # 2021-02-25 DEB update site data from www map
#
# # get these data from SCAN/SNOTEL www map, zoom all the way out and then click on export to CSV
# # there are some trash data in here, trailing tabs
# x <- read.csv('scan-snotel-data/scan-snotel-site-data.csv', stringsAsFactors = FALSE, colClasses = 'character')
#
# # fix formatting
# x$Name <- trimws(x$Name)
# x$ID <- as.numeric(trimws(x$ID))
# x$State <- trimws(x$State)
# x$Network <- trimws(x$Network)
# x$County <- trimws(x$County)
# x$Elevation_ft <- as.numeric(trimws(x$Elevation_ft))
# x$Latitude <- as.numeric(trimws(x$Latitude))
# x$Longitude <- as.numeric(trimws(x$Longitude))
# x$HUC <- trimws(x$HUC)
#
# # re-name ID
# names(x)[2] <- 'Site'
#
# # check: OK
# nrow(x)
# str(x)
#
#
# ##
# ## pedon / lab IDs
# ##
#
# ## SCAN / SNOTEL sites from western US
# # most of these files are maintained by regional staff
# # naming convention from NASIS site table
# p.west <- read.csv('scan-snotel-data/Utah_DCO_Soil_Lab_Data.csv', stringsAsFactors = FALSE)
#
# what's in here:
# # many sites from several states!
# str(p.west)
# table(p.west$state)
#
# # keep subset of columns
# p.west <- p.west[, c('climstaid', 'climstanm', 'upedonid', 'pedlabsampnum')]
#
# # re-name ID
# names(p.west)[1] <- 'Site'
#
# # re-name for mixing
# names(p.west)[-1] <- paste0(names(p.west)[-1], '-WEST')
#
# # check: ok
# str(p.west)
#
#
#
# ## SCAN data via Steve Campbell / soil climate center
# # missing lab IDs
# # missing SNOTEL sites
# p.scan <- read.csv('scan-snotel-data/SCAN_Pedon_Master.csv', stringsAsFactors = FALSE)
#
# str(p.scan)
# table(p.scan$State)
#
# # re-name to match other metadata
# names(p.scan) <- c('Site', 'climstanm', 'state', 'upedonid')
#
# # look-up lab ID via LIMS report
# # takes a couple of minutes
# # some pedon IDs won't map to a lab ID (not linked in NASIS)
# p.scan$pedlabsampnum <- getLabPedon(p.scan$upedonid)
#
# # replace missing values with NA
# p.scan$pedlabsampnum[which(p.scan$pedlabsampnum == '')] <- NA
#
# # re-name and subset columns
# p.scan <- p.scan[, c('Site', 'climstanm', 'upedonid', 'pedlabsampnum')]
# names(p.scan)[-1] <- paste0(names(p.scan)[-1], '-SCAN')
#
#
# ##
# ## merge metadata from various sources, filling in the missing values with best available data
# ##
#
# # unique set of site IDs
# m <- data.frame(Site=unique(c(p.west$Site, p.scan$Site)), stringsAsFactors = FALSE)
#
# # western data
# m <- join(m, p.west, by='Site', type='left')
#
# # SCAN master list
# m <- join(m, p.scan, by='Site', type='left')
#
# # new columns for best-available
# m$climstanm <- NA
# m$upedonid <- NA
# m$pedlabsampnum <- NA
#
# ### TODO: double check logic
# # select best available
# m$climstanm <- ifelse(! is.na(m$`climstanm-WEST`), m$`climstanm-WEST`, m$`climstanm-SCAN`)
# m$upedonid <- ifelse(! is.na(m$`upedonid-WEST`), m$`upedonid-WEST`, m$`upedonid-SCAN`)
# m$pedlabsampnum <- ifelse(! is.na(m$`pedlabsampnum-WEST`), m$`pedlabsampnum-WEST`, m$`pedlabsampnum-SCAN`)
#
#
# ##
# ## combine site metadata and pedon links
# ##
#
# SCAN_SNOTEL_metadata <- join(x, m[, c('Site', 'climstanm', 'upedonid', 'pedlabsampnum')], by='Site', type='left')
#
# # check for possible errors via station name comparison
# idx <- which( ! SCAN_SNOTEL_metadata$Name == SCAN_SNOTEL_metadata$climstanm )
# SCAN_SNOTEL_metadata[idx, c('Site', 'Name', 'climstanm')]
#
# # hmm... mostly abbreviations and spelling
#
# # save as R data file
# save(SCAN_SNOTEL_metadata, file='../data/SCAN_SNOTEL_metadata.rda')
#
#
#
#
#
File renamed without changes.
File renamed without changes.
