---
title: "Benchmark"
date: "`r Sys.Date()`"
output:
  html_document:
    code_folding: hide  # or "hide" "show"
    code-annotations: hover
urlcolor: blue
vignette: >
  %\VignetteIndexEntry{Benchmark}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
---

```{r, include = FALSE}
# Chunks are only evaluated when the NOT_CRAN environment variable is "true"
# (compared case-insensitively).
not_cran <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = not_cran,
  out.width = "100%"
)

# Initialize variables to prevent "object not found" error when eval=FALSE
time_diff <- "XX"
memo_diff <- "XX"

# CRAN OMP THREAD LIMIT to avoid CRAN NOTE
Sys.setenv(OMP_THREAD_LIMIT = 2)
```

This vignette shows a few benchmarks comparing **{duckspatial}** vs **{sf}**. We look at how both packages compare in terms of computation time and memory use when performing different spatial operations with increasingly large data sets. We plan to extend this vignette in the future to benchmark other types of spatial operations.

## TL;DR

- {duckspatial} is substantially faster and uses way less memory than {sf} in pretty much all cases, particularly when working with large data sets.

## Prepare data for benchmark

The chunk of code below loads a few libraries and creates a few sample data sets used in this benchmark.

```{r, message = FALSE, warning=FALSE}
library(duckspatial)
library(bench)
library(dplyr)
library(sf)
library(lwgeom)
library(ggplot2)

options(scipen = 999)

# read polygons data
countries_sf <- sf::st_read(
  system.file("spatial/countries.geojson", package = "duckspatial")
)

# Create an sf point layer (CRS 4326) with `n` random points.
# `x` is drawn from [-180, 180] and `y` from [-90, 90]; `id` runs from 1 to n.
# `x` is drawn before `y`, so results depend on the RNG state at call time.
make_random_points <- function(n) {
  data.frame(
    id = seq_len(n),
    x = runif(n, min = -180, max = 180),
    y = runif(n, min = -90, max = 90)
  ) |>
    sf::st_as_sf(coords = c("x", "y"), crs = 4326)
}

# generate random points (seed set once, before the first data set)
set.seed(42)

## create points data
points_sf_100k <- make_random_points(1e5)
points_sf_1mi <- make_random_points(1e6)

# The 10 million points case is currently disabled:
# points_sf_10mi <- make_random_points(1e7)
```

# Spatial Join

```{r, message = FALSE}
# Benchmark a "within" spatial join of `points_sf` against `countries_sf`
# with {duckspatial} and {sf}.
#
# points_sf: sf object with the points to join.
# Returns the `bench::mark()` result (one row per package, in the order
# duckspatial, sf) with two extra columns: `n` (number of points) and `pkg`.
run_benchmark <- function(points_sf) {
  temp_bench <- bench::mark(
    iterations = 1,
    check = FALSE, # do not compare the outputs of the two expressions
    duckspatial = duckspatial::ddbs_join(
      x = points_sf,
      y = countries_sf,
      join = "within"
    ),
    sf = sf::st_join(
      x = points_sf,
      y = countries_sf,
      join = sf::st_within
    )
  )

  temp_bench$n <- nrow(points_sf)
  temp_bench$pkg <- c("duckspatial", "sf")
  temp_bench
}

# Run the benchmark with 100K and 1 million points
df_bench_join <- lapply(
  X = list(points_sf_100k, points_sf_1mi),
  FUN = run_benchmark
) |>
  dplyr::bind_rows()

# calculate difference in performance for the 1 million points case,
# looking rows up by package name rather than by position
temp <- df_bench_join |>
  dplyr::filter(n == 1e6)
is_sf <- temp$pkg == "sf"

# how many times more memory {sf} allocated than {duckspatial}
memo_diff <- round(
  as.numeric(temp$mem_alloc[is_sf]) / as.numeric(temp$mem_alloc[!is_sf]),
  1
)

# percentage reduction in median time of {duckspatial} relative to {sf}
time_ratio <- as.numeric(temp$median[!is_sf]) / as.numeric(temp$median[is_sf])
time_diff <- (1 - round(time_ratio, 2)) * 100
```

In this example working with 1 million points, {duckspatial} was `r time_diff`% faster and used `r memo_diff` times less memory than {sf}. Not bad.

```{r, warning=FALSE}
# Memory allocated vs. median computation time of the spatial join,
# coloured by package, with one point shape per data size
ggplot(data = df_bench_join) +
  geom_point(
    size = 3,
    aes(
      x = mem_alloc,
      y = median,
      color = pkg,
      shape = format(n, big.mark = ".")
    )
  ) +
  labs(
    color = "Package",
    shape = "Data size",
    y = "Computation time (seconds)",
    x = "Memory allocated"
  ) +
  theme_minimal()
```

# Spatial filter

```{r, message = FALSE}
# Benchmark a spatial filter of `points_sf` by `countries_sf`
# with {duckspatial} and {sf}.
#
# points_sf: sf object with the points to filter.
# Returns the `bench::mark()` result (one row per package, in the order
# duckspatial, sf) with two extra columns: `n` (number of points) and `pkg`.
run_benchmark <- function(points_sf) {
  temp_bench <- bench::mark(
    iterations = 1,
    check = FALSE, # do not compare the outputs of the two expressions
    duckspatial = duckspatial::ddbs_filter(
      x = points_sf,
      y = countries_sf
    ),
    sf = sf::st_filter(
      x = points_sf,
      y = countries_sf
    )
  )

  temp_bench$n <- nrow(points_sf)
  temp_bench$pkg <- c("duckspatial", "sf")
  temp_bench
}

# Run the benchmark with 100K and 1 million points
df_bench_filter <- lapply(
  X = list(points_sf_100k, points_sf_1mi),
  FUN = run_benchmark
) |>
  dplyr::bind_rows()

# calculate difference in performance for the 1 million points case,
# looking rows up by package name rather than by position
temp <- df_bench_filter |>
  dplyr::filter(n == 1e6)
is_sf <- temp$pkg == "sf"

# how many times more memory {sf} allocated than {duckspatial}
memo_diff <- round(
  as.numeric(temp$mem_alloc[is_sf]) / as.numeric(temp$mem_alloc[!is_sf]),
  1
)

# percentage reduction in median time of {duckspatial} relative to {sf}
time_ratio <- as.numeric(temp$median[!is_sf]) / as.numeric(temp$median[is_sf])
time_diff <- (1 - round(time_ratio, 2)) * 100
```

In this example working with 1 million points, {duckspatial} was `r time_diff`% faster and used `r memo_diff` times less memory than {sf}.

The plot below shows the computation time and memory allocated by each package for each data size.

```{r, warning=FALSE}
# Memory allocated vs. median computation time of the spatial filter,
# coloured by package, with one point shape per data size
ggplot(data = df_bench_filter) +
  geom_point(
    size = 3,
    aes(
      x = mem_alloc,
      y = median,
      color = pkg,
      shape = format(n, big.mark = ".")
    )
  ) +
  labs(
    color = "Package",
    shape = "Data size",
    y = "Computation time (seconds)",
    x = "Memory allocated"
  ) +
  theme_minimal()
```

# Spatial distances

```{r, message = FALSE}
# Turn on S2 (Spherical geometry)
sf::sf_use_s2(TRUE)

# Benchmark distance calculation between a set of `n` random points and
# itself with {duckspatial} and {sf}.
#
# n: number of random points to generate (CRS 4326).
# Returns the `bench::mark()` result (one row per package, in the order
# duckspatial, sf) with two extra columns: `n` (number of points) and `pkg`.
run_benchmark <- function(n) {
  # the seed is reset on every call, so each run starts from the same RNG state
  set.seed(42)

  ## create points data
  points_sf <- data.frame(
    id = seq_len(n),
    x = runif(n, min = -180, max = 180),
    y = runif(n, min = -90, max = 90)
  ) |>
    sf::st_as_sf(coords = c("x", "y"), crs = 4326)

  temp_bench <- bench::mark(
    iterations = 1,
    check = FALSE, # do not compare the outputs of the two expressions
    duckspatial = duckspatial::ddbs_distance(
      x = points_sf,
      y = points_sf,
      dist_type = "haversine"
    ),
    sf = sf::st_distance(
      x = points_sf,
      y = points_sf,
      which = "Great Circle"
    )
  )

  temp_bench$n <- nrow(points_sf)
  temp_bench$pkg <- c("duckspatial", "sf")
  temp_bench
}

# Run the benchmark with 500, 1,000 and 10,000 points
df_bench_distance <- lapply(
  X = c(500, 1000, 10000),
  FUN = run_benchmark
) |>
  dplyr::bind_rows()

# calculate difference in performance for the 10,000 points case,
# looking rows up by package name rather than by position
temp <- df_bench_distance |>
  dplyr::filter(n == 10000)
is_sf <- temp$pkg == "sf"

# how many times more memory {duckspatial} allocated than {sf}
memo_diff <- round(
  as.numeric(temp$mem_alloc[!is_sf]) / as.numeric(temp$mem_alloc[is_sf]),
  1
)

# percentage reduction in median time of {duckspatial} relative to {sf}
time_ratio <- as.numeric(temp$median[!is_sf]) / as.numeric(temp$median[is_sf])
time_diff <- (1 - round(time_ratio, 2)) * 100
```

In this example calculating the distance between 10K points, {duckspatial} was `r time_diff`% faster, but used `r memo_diff` times more memory than {sf}. Mind you that {sf} is still more efficient when calculating Euclidean distances.

The plot below shows the computation time and memory allocated by each package for each data size.

```{r, warning=FALSE}
# Memory allocated vs. median computation time of the distance calculation,
# coloured by package, with one point shape per data size
ggplot(data = df_bench_distance) +
  geom_point(
    size = 3,
    aes(
      x = mem_alloc,
      y = median,
      color = pkg,
      shape = format(n, big.mark = ".")
    )
  ) +
  labs(
    color = "Package",
    shape = "Data size",
    y = "Computation time (seconds)",
    x = "Memory allocated"
  ) +
  theme_minimal()
```