## ----include = FALSE----------------------------------------------------------
# Vignette script: download the VASCAN Darwin Core archive, load its taxon
# table, validate it with dwctaxon, inspect the reported problems, and
# re-validate after a minimal fix.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# Increase width for printing tibbles; the previous options are kept in `old`
# so they can be restored in the final chunk
old <- options(width = 140)

## ----setup, message = FALSE---------------------------------------------------
library(dwctaxon)
library(readr)
library(tibble)
library(dplyr)

## ----download-setup-----------------------------------------------------------
# - Specify temporary folder for downloading data
temp_dir <- tempdir()
# - Set name of zip file
temp_zip <- file.path(temp_dir, "dwca-vascan.zip")
# - Set name of unzipped folder
temp_unzip <- file.path(temp_dir, "dwca-vascan")

## ----get-vascan-url-----------------------------------------------------------
# Sourcing this file is expected to define `vascan_url`, which is used below
source(system.file("extdata", "vascan_url.R", package = "dwctaxon"))

# Check that we now have the URL loaded:
vascan_url

## ----echo = FALSE, results = "asis"-------------------------------------------
# Check if file can be downloaded safely, quit early if not
if (!dwctaxon:::safe_to_download(vascan_url)) {
  cat(
    paste0(
      "Vignette rendering stopped. The zip file (",
      vascan_url,
      ") could not be downloaded. Check your internet connection and the URL."
    )
  )
  knitr::knit_exit()
}

## ----download-unzip-hide, include = FALSE-------------------------------------
# Download and unzip data
download_success <- dwctaxon:::safe_download_unzip(
  url = vascan_url,
  destfile = temp_zip,
  exdir = temp_unzip
)

# Check if download or unzip failed
if (!download_success) {
  message("Zip file could not be loaded. Stopping vignette rendering.")
  knitr::knit_exit()
}

## ----download-unzip-show, eval = FALSE----------------------------------------
# # Download data
# download.file(url = vascan_url, destfile = temp_zip, mode = "wb")
#
# # Unzip
# unzip(temp_zip, exdir = temp_unzip)

## ----list-zip-contents--------------------------------------------------------
list.files(temp_unzip)

## ----load-data----------------------------------------------------------------
vascan <- read_tsv(file.path(temp_unzip, "taxon.txt"))

# Take a peek at the data
vascan

## ----validation, error = TRUE-------------------------------------------------
# Wrapped in try() so that a validation error does not halt the script
try({
  dct_validate(vascan)
})

## ----validation-summary-------------------------------------------------------
# Same validation, but requesting a summary of the failures
validation_res <- dct_validate(vascan, on_fail = "summary")

validation_res

## ----summary-analysis---------------------------------------------------------
validation_res %>%
  count(check, error)

## ----summary-analysis-hide, show = FALSE, echo = FALSE------------------------
# NOTE(review): `show` does not look like a documented knitr chunk option;
# confirm whether `include = FALSE` was intended in the vignette source.
# These counts are not used again in this script - presumably they feed inline
# text in the vignette; confirm against the .Rmd source.
validation_res_sum <- validation_res %>%
  count(check, error)

n_error_types <- nrow(validation_res_sum) %>%
  english::english()

n_bad_cols <- validation_res_sum %>%
  filter(error == "Invalid column names detected: id") %>%
  pull(n) %>%
  english::english()

n_bad_sci_name <- validation_res_sum %>%
  filter(error == "scientificName detected with duplicated value") %>%
  pull(n)

## ----check-sci-name-dups------------------------------------------------------
# The pattern must stay on a single line: a line break inside the literal
# would become part of the pattern and prevent it from matching
dup_names <- validation_res %>%
  filter(grepl("scientificName detected with duplicated value", error)) %>%
  arrange(scientificName)

dup_names

## ----check-sci-name-dups-orig-------------------------------------------------
# Look up the flagged taxonIDs in the original data
inner_join(
  select(dup_names, taxonID),
  vascan,
  by = "taxonID"
) %>%
  # Just look at the first 6 columns
  select(1:6)

## ----inspect-id---------------------------------------------------------------
vascan %>%
  select(id)

n_distinct(vascan$id)

## ----fix-data-----------------------------------------------------------------
# Keep only the first row for each scientificName
vascan_fixed <- vascan %>%
  filter(!duplicated(scientificName))

## ----validation-2-------------------------------------------------------------
# Re-validate, passing "id" via `extra_cols`
dct_validate(
  vascan_fixed,
  extra_cols = "id"
)

## ----include = FALSE----------------------------------------------------------
# Reset options
options(old)