Operators
John Mount, Nina Zumel
2023-08-19
cdata
recommends an operator idiom to apply data
transforms.
The idea is simple, yet powerful.
First let’s start with some data.
d <- wrapr::build_frame(
"model_id" , "measure", "value" |
1 , "AUC" , 0.7 |
1 , "R2" , 0.4 |
2 , "AUC" , 0.8 |
2 , "R2" , 0.5 )
knitr::kable(d)
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
In the above data we have two measurements each for two individuals
(individuals identified by the “model_id
” column). Using
cdata
’s rowrecs_to_blocks_spec()
method we can
capture a description of this record structure and transformation
details.
library("cdata")
transform <- rowrecs_to_blocks_spec(
wrapr::qchar_frame(
"measure", "value" |
"AUC" , AUC |
"R2" , R2 ),
recordKeys = "model_id")
print(transform)
#> {
#> row_record <- wrapr::qchar_frame(
#> "model_id" , "AUC", "R2" |
#> . , AUC , R2 )
#> row_keys <- c('model_id')
#>
#> # becomes
#>
#> block_record <- wrapr::qchar_frame(
#> "model_id" , "measure", "value" |
#> . , "AUC" , AUC |
#> . , "R2" , R2 )
#> block_keys <- c('model_id', 'measure')
#>
#> # args: c(checkNames = TRUE, checkKeys = FALSE, strict = FALSE, allow_rqdatatable = FALSE)
#> }
Once we have this specification we can transform the data using
operator notation.
We can collect the record blocks into rows by a “factor-out” (or
aggregation/projection) step.
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
d2 <- d %//% t(transform)
knitr::kable(d2)
# (or using general pipe notation
# includng the .() "execute immediately")
d %.>%
.(t(transform)) %.>%
knitr::kable(.)
We can expand record rows into blocks by a “multiplication” (or join)
step.
d3 <- d2 %**% transform
knitr::kable(d3)
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
# (or using general pipe notation)
d2 %.>%
transform %.>%
knitr::kable(.)
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
(%//%
and %**%
being two operators
introduced by the cdata
package.)
And the two specialized operators have an inverse/adjoint
relation.
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
# identity
d4 <- d %//% t(transform) %**% transform
knitr::kable(d4)
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
We can also pipe into the spec (and into its adjoint) using the wrapr
dot pipe operator
.
# reverse or adjoint/transpose operation specification
t_record_spec <- t(transform)
d %.>%
t_record_spec %.>%
knitr::kable(.)
# using dot-pipe's bquote style .() execute immediate notation
d %.>%
.(t(transform)) %.>%
knitr::kable(.)
# identity
d %.>%
.(t(transform)) %.>%
transform %.>%
knitr::kable(.)
1 |
AUC |
0.7 |
1 |
R2 |
0.4 |
2 |
AUC |
0.8 |
2 |
R2 |
0.5 |
And, of course, the exact same functionality for database tables.
have_db <- requireNamespace("DBI", quietly = TRUE) &&
requireNamespace("RSQLite", quietly = TRUE)
raw_connection <- DBI::dbConnect(RSQLite::SQLite(),
":memory:")
RSQLite::initExtension(raw_connection)
db <- rquery::rquery_db_info(
connection = raw_connection,
is_dbi = TRUE,
connection_options = rquery::rq_connection_tests(raw_connection))
d_td <- rquery::rq_copy_to(db, "d", d)
ops <- d_td %//% t(transform)
cat(format(ops))
#> mk_td("d", c(
#> "model_id",
#> "measure",
#> "value")) %.>%
#> non_sql_node(., CREATE TEMPORARY TABLE "OUT" AS SELECT a."model_id" "model_id", MAX( CASE WHEN CAST(a."measure" AS VARCHAR) = 'AUC' THEN a."value" ELSE NULL END ) "AUC", MAX( CASE WHEN CAST(a."measure" AS VARCHAR) = 'R2' THEN a."value" ELSE NULL END ) "R2" FROM "IN" a GROUP BY a."model_id")
rquery::execute(db, ops) %.>%
knitr::kable(.)
d_td %.>%
.(t(transform)) %.>%
rquery::execute(db, .) %.>%
knitr::kable(.)
DBI::dbDisconnect(raw_connection)