library(modeldb)
library(duckdb)
library(dplyr)
library(dbplyr)
library(sqlm)
con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbExecute(con, "
CREATE TABLE big100 AS
SELECT
random() AS y,
random() AS x1,
random() AS x2,
random() AS x3,
CASE (random() * 5)::int
WHEN 0 THEN 'A'
WHEN 1 THEN 'B'
WHEN 2 THEN 'C'
WHEN 3 THEN 'D'
ELSE 'E'
END AS grp
FROM generate_series(1, 100000000)
")
big_db <- tbl(con, "big100")Comparison with modeldb
The modeldb package by Edgar Ruiz and Max Kuhn also provides SQL-backed linear regression.
Below we compare the model’s syntax. Note they both have simliar performance.
Numeric predictors only
# sqlm
lm_sql(y ~ x1 + x2 + x3, data = big_db)
# modeldb
big_db |>
select(y, x1, x2, x3) |>
linear_regression_db(y_var = y, auto_count = TRUE)Categorical predictor (5 levels)
# sqlm
lm_sql(y ~ x1 + x2 + grp, data = big_db)
# modeldb
big_db |>
mutate(
grp_B = ifelse(grp == "B", 1, 0),
grp_C = ifelse(grp == "C", 1, 0),
grp_D = ifelse(grp == "D", 1, 0),
grp_E = ifelse(grp == "E", 1, 0)
) |>
select(y, x1, x2, grp_B, grp_C, grp_D, grp_E) |>
linear_regression_db(y_var = y, auto_count = TRUE)Interaction with categorical variable
# sqlm
lm_sql(y ~ x1 + x2 * grp, data = big_db)
# modeldb
big_db |>
mutate(
grp_B = ifelse(grp == "B", 1, 0),
grp_C = ifelse(grp == "C", 1, 0),
grp_D = ifelse(grp == "D", 1, 0),
grp_E = ifelse(grp == "E", 1, 0),
x2_grp_B = x2 * ifelse(grp == "B", 1, 0),
x2_grp_C = x2 * ifelse(grp == "C", 1, 0),
x2_grp_D = x2 * ifelse(grp == "D", 1, 0),
x2_grp_E = x2 * ifelse(grp == "E", 1, 0)
) |>
select(y, x1, x2, grp_B, grp_C, grp_D, grp_E,
x2_grp_B, x2_grp_C, x2_grp_D, x2_grp_E) |>
linear_regression_db(y_var = y, auto_count = TRUE))DBI::dbDisconnect(con, shutdown = TRUE)