---
title: "Benchmark"
date: "`r Sys.Date()`"
output:
  html_document:
    code_folding: hide   # or "hide" "show"
code-annotations: hover
urlcolor: blue
vignette: >
  %\VignetteIndexEntry{Benchmark}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
---
  
```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = identical(tolower(Sys.getenv("NOT_CRAN")), "true"),
  out.width = "100%"
)

# Initialize variables to prevent "object not found" error when eval=FALSE
time_diff <- "XX"
memo_diff <- "XX"

# CRAN OMP THREAD LIMIT to avoid CRAN NOTE
Sys.setenv(OMP_THREAD_LIMIT = 2)
```

This vignette shows a few benchmarks comparing **{duckspatial}** vs **{sf}**. We 
look at how both packages compare in terms of computation time and memory use 
when performing different spatial operations with increasingly large data sets. 
We plan to extend this vignette in the future to benchmark other types of spatial 
operations.

## TL;DR

- {duckspatial} is substantially faster and uses way less memory than 
{sf} in pretty much all cases, particularly when working with large data sets


## Prepare data for benchmark

This chunk of code below loads a few libraries and create a few sample data sets
used in this benchmark.
```{r, message = FALSE, warning=FALSE}
library(duckspatial)
library(bench)
library(dplyr)
library(sf)
library(lwgeom)
library(ggplot2)
options(scipen = 999)

# read polygons data
countries_sf <- sf::st_read(system.file("spatial/countries.geojson", package = "duckspatial"))

# generate random points
set.seed(42)

## create points data
n = 10e4
points_sf_100k <- data.frame(
    id = 1:n,
    x = runif(n, min = -180, max = 180),  
    y = runif(n, min = -90, max = 90)
    ) |> 
    sf::st_as_sf(coords = c("x", "y"), crs = 4326)

n = 10e5
points_sf_1mi <- data.frame(
    id = 1:n,
    x = runif(n, min = -180, max = 180),  
    y = runif(n, min = -90, max = 90)
    ) |> 
    sf::st_as_sf(coords = c("x", "y"), crs = 4326)

# n = 10e6
# points_sf_10mi <- data.frame(
#     id = 1:n,
#     x = runif(n, min = -180, max = 180),  
#     y = runif(n, min = -90, max = 90)
#     ) |> 
#     sf::st_as_sf(coords = c("x", "y"), crs = 4326)

```
# Spatial Join

```{r, message = FALSE}
run_benchmark <- function(points_sf){
    
    temp_bench <- bench::mark(
        iterations = 1, 
        check = FALSE, 
        duckspatial = duckspatial::ddbs_join(
            x = points_sf, 
            y = countries_sf, 
            join = "within"),
        
        sf = sf::st_join(
            x = points_sf, 
            y = countries_sf, 
            join = sf::st_within)
        )
    
    temp_bench$n <- nrow(points_sf)
    temp_bench$pkg <- c("duckspatial", "sf")
    
    return(temp_bench)
}

# From 100K points to 1 million and 10 million points
df_bench_join <- lapply(
    X = list(points_sf_100k, points_sf_1mi),
    FUN = run_benchmark
    ) |> 
    dplyr::bind_rows()


# calculate difference in performance
temp <- df_bench_join |> 
    filter(n == 10e5)

memo_diff <- round(as.numeric(temp$mem_alloc[2] / temp$mem_alloc[1]),1)
time_diff <- (1 - round(as.numeric(temp$median[1] / temp$median[2]),2))*100
```

In this example working with 1 million points, {duckspatial} was `r time_diff`% 
faster and used `r memo_diff` times less memory than {sf}. Not bad.


```{r, warning=FALSE}
ggplot(data = df_bench_join) +
    geom_point(size =3, aes(x= mem_alloc, y = median, color = pkg, 
                    shape = format(n, big.mark = ".")
                    )) +
    labs(color= "Package", shape = "Data size",
         y = "Computation time (seconds)",
         x = "Memory allocated") +
    theme_minimal()


```


# Spatial filter

```{r, message = FALSE}
run_benchmark <- function(points_sf){
    
    temp_bench <- bench::mark(
        iterations = 1, 
        check = FALSE, 
        duckspatial = duckspatial::ddbs_filter(
            x = points_sf, 
            y = countries_sf),
        
        sf = sf::st_filter(
            x = points_sf, 
            y = countries_sf)
        )
    
    temp_bench$n <- nrow(points_sf)
    temp_bench$pkg <- c("duckspatial", "sf")
    
    return(temp_bench)
}


# From 100K points to 1 million and 10 million points
df_bench_filter <- lapply(
    X = list(points_sf_100k, points_sf_1mi),
    FUN = run_benchmark
    ) |> 
    dplyr::bind_rows()


# calculate difference in performance
temp <- df_bench_filter |> 
    filter(n == 10e5)

memo_diff <- round(as.numeric(temp$mem_alloc[2] / temp$mem_alloc[1]),1)
time_diff <- (1 - round(as.numeric(temp$median[1] / temp$median[2]),2))*100
```
In this example working with 1 million points, {duckspatial} was `r time_diff`% 
faster and used `r memo_diff` times less memory than {sf}. 

plot

```{r, warning=FALSE}
ggplot(data = df_bench_filter) +
    geom_point(size =3, aes(x= mem_alloc, y = median, color = pkg, 
                    shape = format(n, big.mark = ".")
                    )) +
    labs(color= "Package", shape = "Data size",
         y = "Computation time (seconds)",
         x = "Memory allocated") +
    theme_minimal()


```




# Spatial distances

```{r, message = FALSE}
# Turn on S2 (Spherical geometry)
sf::sf_use_s2(TRUE)

run_benchmark <- function(n){
    
    set.seed(42)

    ## create points data
    points_sf <- data.frame(
        id = 1:n,
        x = runif(n, min = -180, max = 180),  
        y = runif(n, min = -90, max = 90)
        ) |> 
        sf::st_as_sf(coords = c("x", "y"), crs = 4326)
    
    temp_bench <- bench::mark(
        iterations = 1, 
        check = FALSE, 
        duckspatial = duckspatial::ddbs_distance(
            x = points_sf, 
            y = points_sf, 
            dist_type = "haversine"),
        
        sf = sf::st_distance(
            x = points_sf, 
            y = points_sf, 
            which = "Great Circle")
        )
    
    temp_bench$n <- nrow(points_sf)
    temp_bench$pkg <- c("duckspatial", "sf")
    
    return(temp_bench)
}


# From 100K points to 1 million and 10 million points
df_bench_distance <- lapply(
    X = c(500, 1000, 10000),
    FUN = run_benchmark
    ) |> 
    dplyr::bind_rows()


# calculate difference in performance
temp <- df_bench_distance |> 
    filter(n == 10000)

memo_diff <- round(as.numeric(temp$mem_alloc[1]) / as.numeric(temp$mem_alloc[2]) ,1)
time_diff <- (1 - round(as.numeric(temp$median[1] / temp$median[2]),2))*100
```

In this example calculating the distance between 10K points, {duckspatial} was `r time_diff`% 
faster, but used `r memo_diff` times more memory than {sf}. Mind you that {sf} is 
still more efficient when calculating Euclidean distances.

plot

```{r, warning=FALSE}
ggplot(data = df_bench_distance) +
    geom_point(size =3, aes(x= mem_alloc, y = median, color = pkg, 
                    shape = format(n, big.mark = ".")
                    )) +
    labs(color= "Package", shape = "Data size",
         y = "Computation time (seconds)",
         x = "Memory allocated") +
    theme_minimal()


```