tutorial.Rmdrollup: A Tidy implementation of
GROUPING SETS, WITH ROLLUP, and
WITH CUBE, which are powerful extensions of the
GROUP BY clause that compute multiple group-by clauses in a
single statement in SQL. This package operates on top of
the dplyr and performs the same functions as
SQL.
# From CRAN
install.packages("rollup")
# From Github
library(devtools)
devtools::install_github("JuYoungAhn/rollup")
library(rollup)rollup package allow you to simplify
multiple group_by operations into a single, concise
statement.
mtcars %>% group_by(vs, am) %>% grouping_sets("vs","am",c("vs","am"), NA) %>%
summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 9 × 4
#> vs am n avg_mpg
#> <chr> <chr> <int> <dbl>
#> 1 0 NA 18 16.6
#> 2 1 NA 14 24.6
#> 3 NA 0 19 17.1
#> 4 NA 1 13 24.4
#> 5 0 0 12 15.0
#> 6 0 1 6 19.8
#> 7 1 0 7 20.7
#> 8 1 1 7 28.4
#> 9 NA NA 32 20.1
mtcars %>% group_by(vs, am) %>% with_rollup() %>%
summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 7 × 4
#> vs am n avg_mpg
#> <chr> <chr> <int> <dbl>
#> 1 0 0 12 15.0
#> 2 0 1 6 19.8
#> 3 1 0 7 20.7
#> 4 1 1 7 28.4
#> 5 0 NA 18 16.6
#> 6 1 NA 14 24.6
#> 7 NA NA 32 20.1
mtcars %>% group_by(vs, am) %>% with_cube() %>%
summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 9 × 4
#> vs am n avg_mpg
#> <chr> <chr> <int> <dbl>
#> 1 0 NA 18 16.6
#> 2 1 NA 14 24.6
#> 3 NA 0 19 17.1
#> 4 NA 1 13 24.4
#> 5 0 0 12 15.0
#> 6 0 1 6 19.8
#> 7 1 0 7 20.7
#> 8 1 1 7 28.4
#> 9 NA NA 32 20.1By default, total/subtotal rows have NA values in
grouping columns. You can replace these with custom labels using the
total_label parameter:
# Replace NA with "Total" in total rows
mtcars %>% group_by(vs, am) %>%
with_rollup(total_label = "Total") %>%
summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 7 × 4
#> vs am n avg_mpg
#> <chr> <chr> <int> <dbl>
#> 1 0 0 12 15.0
#> 2 0 1 6 19.8
#> 3 1 0 7 20.7
#> 4 1 1 7 28.4
#> 5 0 Total 18 16.6
#> 6 1 Total 14 24.6
#> 7 Total Total 32 20.1Use total_on_top = TRUE to display total/subtotal rows
at the beginning of the result:
# Total rows appear first
mtcars %>% group_by(vs, am) %>%
with_cube(total_label = "Grand Total", total_on_top = TRUE) %>%
summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 9 × 4
#> vs am n avg_mpg
#> <chr> <chr> <int> <dbl>
#> 1 Grand Total Grand Total 32 20.1
#> 2 0 Grand Total 18 16.6
#> 3 1 Grand Total 14 24.6
#> 4 Grand Total 0 19 17.1
#> 5 Grand Total 1 13 24.4
#> 6 0 0 12 15.0
#> 7 0 1 6 19.8
#> 8 1 0 7 20.7
#> 9 1 1 7 28.4These parameters work with grouping_sets(),
with_rollup(), and with_cube() functions.
library(dplyr)
library(rollup)
data("web_service_data") # web_service_data of rollup package
web_service_data %>% head
#> # A tibble: 6 × 6
#> date_id id gender age page_view_cnt product_view_cnt_cat
#> <chr> <dbl> <chr> <fct> <dbl> <fct>
#> 1 2024-06-24 19 M 40 0 60%
#> 2 2024-06-24 34 M 40 5 70%
#> 3 2024-06-24 44 F 50 12 100%
#> 4 2024-06-24 57 M 60 87 20%
#> 5 2024-06-24 65 F 50 1 100%
#> 6 2024-06-24 86 F 40 3 90%grouping_sets() allows you to perform multiple
group_by operations simultaneously, producing combined
results in a single output.grouping_sets('a') is equivalent to the single grouping
set operation group_by(a).grouping_sets('a','b') is equivalent to row binding of
group_by(a) and group_by(b).grouping_sets(c('a','b'),'a','b', NA) is equivalent to
row binding of group_by(a,b), group_by(a),
group_by(b) and without group_by
operation.
library(tidyr)
# compute average of `page_view_cnt` group by "gender", "age", and "gender & age", along with the overall average. NA in the output table represents overall aggregates.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% grouping_sets('gender', 'age', c('gender','age'), NA) %>%
summarize(avg_pv_cnt = mean(page_view_cnt))
#> # A tibble: 21 × 3
#> gender age avg_pv_cnt
#> <chr> <chr> <dbl>
#> 1 F NA 2.28
#> 2 M NA 1.92
#> 3 NA 10 1.61
#> 4 NA 20 3.01
#> 5 NA 30 2.23
#> 6 NA 40 1.77
#> 7 NA 50 1.44
#> 8 NA 60 2.30
#> 9 F 10 2.33
#> 10 F 20 2.86
#> # ℹ 11 more rows
# compute average of `page_view_cnt` group by "gender & age & product_view_cnt_cat" along with the marginal average with regard to "product_view_cnt_cat".
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age, product_view_cnt_cat) %>%
grouping_sets('product_view_cnt_cat', c('product_view_cnt_cat', 'gender','age')) %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>%
pivot_wider(names_from = product_view_cnt_cat, values_from = avg_pv_cnt)
#> # A tibble: 13 × 11
#> gender age `100%` `20%` `40%` `50%` `60%` `70%` `80%` `90%` X
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 NA NA 2.82 1.84 2.02 2.31 2.72 2.89 2.8 3.79 1.46
#> 2 F 20 2.25 3.5 2.08 2.29 3.83 2.57 3.45 4.83 0
#> 3 F 30 3.17 2.5 4.5 2.88 3 1.75 3.5 3 0.833
#> 4 F 40 2 1.9 2.7 2.2 1.22 3 3.38 4 1.33
#> 5 F 50 3.5 1.5 2 2.5 1.2 4 2.5 5.33 0.462
#> 6 F 60 3 1.71 1 1.33 3 3 1.5 2 1.19
#> 7 M 20 7 3.17 3.16 3.55 4.5 3 NA 3.5 1.14
#> 8 F 10 NA 2 1.4 2.67 4 NA NA 4 1.4
#> 9 M 10 NA 0.833 1.14 3 1 0 NA NA 0.375
#> 10 M 30 NA 1.62 1.31 2.7 3.38 2.5 1.86 3.5 0.824
#> 11 M 40 NA 0.933 2.06 0.833 1.88 3.25 1.6 1.67 0.889
#> 12 M 50 NA 1.07 1.06 2.6 2 0 0.5 0 0.562
#> 13 M 60 NA 2.69 4 3.5 0 8 2 1 3.06with_cube() automatically generates all possible
combinations of specified variables in group_by
clause.with_cube() function is a simplified way of expressing
grouping_sets().with_cube() is equivalent to using
grouping_sets() with all combinations of the specified
columns.group_by(a,b,c) followed by
with_cube() equals to
grouping_sets(c('a','b','c'), c('a','b'), c('a','c'), c('b','c'), 'a', 'b', 'c', NA).with_cube() is particularly useful when you want to
include total aggregates of both rows and columns in a cross table.
# This produces a table with average page view counts grouped by gender and age, including total aggregates across all combinations.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% with_cube() %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>%
pivot_wider(names_from = age, values_from = avg_pv_cnt)
#> # A tibble: 3 × 8
#> gender `NA` `10` `20` `30` `40` `50` `60`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F 2.28 2.33 2.86 2.67 2.33 2.24 1.48
#> 2 M 1.92 0.92 3.19 1.91 1.31 0.907 2.99
#> 3 NA 2.08 1.61 3.01 2.23 1.77 1.44 2.30with_rollup() creates hierarchical aggregations by
progressively reducing the number of grouping variables.with_rollup() is particulary useful when
variables have a hierarchy, because all possible combinations
are not necessary.group_by(a,b) followed by with_rollup()
equals to grouping_sets(c('a','b'), 'a', NA).group_by(a,b,c) followed by with_rollup()
equals to
grouping_sets(c('a','b','c'), ('a','b'), ('a'), NA).
# The variables "age_big" and "age" have a hierarchy.
web_service_data_processed <- web_service_data %>% mutate(
age_big = case_when(
age %in% c(10,20,30) ~ 'young',
age %in% c(40,50,60) ~ 'old'
)
)
# If there are aggregates "age_big & age", marginal aggregates for "age" are not necessary.
# The following code computes aggregates for "age_big & age", "age_big", and entire data set.
web_service_data_processed %>% group_by(age_big, age) %>%
with_rollup() %>% summarize(
user_cnt = n_distinct(id),
avg_pv_cnt = mean(page_view_cnt)
)
#> # A tibble: 9 × 4
#> age_big age user_cnt avg_pv_cnt
#> <chr> <chr> <int> <dbl>
#> 1 old 40 196 2.52
#> 2 old 50 178 1.99
#> 3 old 60 204 2.32
#> 4 young 10 132 1.57
#> 5 young 20 140 3.69
#> 6 young 30 150 3.77
#> 7 old NA 578 2.29
#> 8 young NA 422 3.06
#> 9 NA NA 1000 2.61