Comparison with modeldb

The modeldb package by Edgar Ruiz and Max Kuhn also provides SQL-backed linear regression.

Below we compare the model’s syntax. Note they both have simliar performance.

library(modeldb)
library(duckdb)
library(dplyr)
library(dbplyr)
library(sqlm)

con <- DBI::dbConnect(duckdb::duckdb())

DBI::dbExecute(con, "
  CREATE TABLE big100 AS
  SELECT
    random() AS y,
    random() AS x1,
    random() AS x2,
    random() AS x3,
    CASE (random() * 5)::int
      WHEN 0 THEN 'A'
      WHEN 1 THEN 'B'
      WHEN 2 THEN 'C'
      WHEN 3 THEN 'D'
      ELSE 'E'
    END AS grp
  FROM generate_series(1, 100000000)
")

big_db <- tbl(con, "big100")

Numeric predictors only

# sqlm
lm_sql(y ~ x1 + x2 + x3, data = big_db)


# modeldb
big_db |>
  select(y, x1, x2, x3) |>
  linear_regression_db(y_var = y, auto_count = TRUE)

Categorical predictor (5 levels)

# sqlm
lm_sql(y ~ x1 + x2 + grp, data = big_db)


# modeldb
big_db |>
  mutate(
    grp_B = ifelse(grp == "B", 1, 0),
    grp_C = ifelse(grp == "C", 1, 0),
    grp_D = ifelse(grp == "D", 1, 0),
    grp_E = ifelse(grp == "E", 1, 0)
  ) |>
  select(y, x1, x2, grp_B, grp_C, grp_D, grp_E) |>
  linear_regression_db(y_var = y, auto_count = TRUE)

Interaction with categorical variable

# sqlm
lm_sql(y ~ x1 + x2 * grp, data = big_db)

# modeldb
big_db |>
  mutate(
    grp_B = ifelse(grp == "B", 1, 0),
    grp_C = ifelse(grp == "C", 1, 0),
    grp_D = ifelse(grp == "D", 1, 0),
    grp_E = ifelse(grp == "E", 1, 0),
    x2_grp_B = x2 * ifelse(grp == "B", 1, 0),
    x2_grp_C = x2 * ifelse(grp == "C", 1, 0),
    x2_grp_D = x2 * ifelse(grp == "D", 1, 0),
    x2_grp_E = x2 * ifelse(grp == "E", 1, 0)
  ) |>
  select(y, x1, x2, grp_B, grp_C, grp_D, grp_E,
         x2_grp_B, x2_grp_C, x2_grp_D, x2_grp_E) |>
  linear_regression_db(y_var = y, auto_count = TRUE))
DBI::dbDisconnect(con, shutdown = TRUE)