Getting Started with rollup Package

rollup: A Tidy implementation of GROUPING SETS, WITH ROLLUP, and WITH CUBE, which are powerful extensions of the GROUP BY clause that compute multiple group-by clauses in a single statement in SQL. This package operates on top of the dplyr and performs the same functions as SQL.

Installation

# From CRAN
install.packages("rollup")
 
# From Github
library(devtools)
devtools::install_github("JuYoungAhn/rollup")

In a Nutshell

The functions of rollup package allow you to simplify multiple group_by operations into a single, concise statement.
This makes data aggregation easier and more efficient.

mtcars %>% group_by(vs, am) %>% grouping_sets("vs","am",c("vs","am"), NA) %>% 
  summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 9 × 4
#>      vs    am     n avg_mpg
#>   <dbl> <dbl> <int>   <dbl>
#> 1     0    NA    18    16.6
#> 2     1    NA    14    24.6
#> 3    NA     0    19    17.1
#> 4    NA     1    13    24.4
#> 5     0     0    12    15.0
#> 6     0     1     6    19.8
#> 7     1     0     7    20.7
#> 8     1     1     7    28.4
#> 9    NA    NA    32    20.1

mtcars %>% group_by(vs, am) %>% with_rollup() %>% 
  summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 7 × 4
#> # Groups:   vs [3]
#>      vs    am     n avg_mpg
#>   <dbl> <dbl> <int>   <dbl>
#> 1     0     0    12    15.0
#> 2     0     1     6    19.8
#> 3     1     0     7    20.7
#> 4     1     1     7    28.4
#> 5     0    NA    18    16.6
#> 6     1    NA    14    24.6
#> 7    NA    NA    32    20.1

mtcars %>% group_by(vs, am) %>% with_cube() %>% 
  summarize(n=n(), avg_mpg=mean(mpg))
#> # A tibble: 9 × 4
#>      vs    am     n avg_mpg
#>   <dbl> <dbl> <int>   <dbl>
#> 1     0    NA    18    16.6
#> 2     1    NA    14    24.6
#> 3    NA     0    19    17.1
#> 4    NA     1    13    24.4
#> 5     0     0    12    15.0
#> 6     0     1     6    19.8
#> 7     1     0     7    20.7
#> 8     1     1     7    28.4
#> 9    NA    NA    32    20.1

Practical example

This example shows how to compute the average pageview count grouped by various combinations of gender and age.
In this section, you will see why the rollup package is useful by exploring practical data examples.

Web service data

Description of data

date_id : yyyy-mm-dd
id : user unique id
gender : male(M), female(F)
age : age band (categorical)
page_view_cnt : pageview count of user on date_id
product_view_cnt_cat : decile category of the product view count for a user on date_id.

library(dplyr)
library(rollup)
data("web_service_data") # web_service_data of rollup package
web_service_data %>% head
#> # A tibble: 6 × 6
#>   date_id       id gender age   page_view_cnt product_view_cnt_cat
#>   <chr>      <dbl> <chr>  <fct>         <dbl> <fct>               
#> 1 2024-06-24    19 M      40                0 60%                 
#> 2 2024-06-24    34 M      40                5 70%                 
#> 3 2024-06-24    44 F      50               12 100%                
#> 4 2024-06-24    57 M      60               87 20%                 
#> 5 2024-06-24    65 F      50                1 100%                
#> 6 2024-06-24    86 F      40                3 90%

grouping_sets

grouping_sets() allows you to perform multiple group_by operations simultaneously, producing combined results in a single output.
grouping_sets('a') is equivalent to the single grouping set operation group_by(a).
grouping_sets('a','b') is equivalent to row binding of group_by(a) and group_by(b).
grouping_sets(c('a','b'),'a','b', NA) is equivalent to row binding of group_by(a,b), group_by(a), group_by(b) and without group_by operation.

library(tidyr)
# compute average of `page_view_cnt` group by "gender", "age", and "gender & age", along with the overall average. NA in the output table represents overall aggregates.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>% 
  group_by(gender, age) %>% grouping_sets('gender', 'age', c('gender','age'), NA) %>% 
  summarize(avg_pv_cnt = mean(page_view_cnt))
#> # A tibble: 21 × 3
#>    gender age   avg_pv_cnt
#>    <chr>  <fct>      <dbl>
#>  1 F      NA          2.28
#>  2 M      NA          1.92
#>  3 NA     10          1.61
#>  4 NA     20          3.01
#>  5 NA     30          2.23
#>  6 NA     40          1.77
#>  7 NA     50          1.44
#>  8 NA     60          2.30
#>  9 F      10          2.33
#> 10 F      20          2.86
#> # ℹ 11 more rows

# compute average of `page_view_cnt` group by "gender & age & product_view_cnt_cat" along with the marginal average with regard to "product_view_cnt_cat".
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>% 
  group_by(gender, age, product_view_cnt_cat) %>% 
  grouping_sets('product_view_cnt_cat', c('product_view_cnt_cat', 'gender','age')) %>% 
  summarize(avg_pv_cnt = mean(page_view_cnt)) %>% 
  pivot_wider(names_from = product_view_cnt_cat, values_from = avg_pv_cnt)
#> # A tibble: 13 × 11
#>    gender age       X `20%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
#>    <chr>  <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
#>  1 NA     NA    1.46  1.84   2.02 2.31   2.72  2.89  2.8   3.79   2.82
#>  2 F      10    1.4   2      1.4  2.67   4    NA    NA     4     NA   
#>  3 F      20    0     3.5    2.08 2.29   3.83  2.57  3.45  4.83   2.25
#>  4 F      30    0.833 2.5    4.5  2.88   3     1.75  3.5   3      3.17
#>  5 F      40    1.33  1.9    2.7  2.2    1.22  3     3.38  4      2   
#>  6 F      50    0.462 1.5    2    2.5    1.2   4     2.5   5.33   3.5 
#>  7 F      60    1.19  1.71   1    1.33   3     3     1.5   2      3   
#>  8 M      10    0.375 0.833  1.14 3      1     0    NA    NA     NA   
#>  9 M      20    1.14  3.17   3.16 3.55   4.5   3    NA     3.5    7   
#> 10 M      30    0.824 1.62   1.31 2.7    3.38  2.5   1.86  3.5   NA   
#> 11 M      40    0.889 0.933  2.06 0.833  1.88  3.25  1.6   1.67  NA   
#> 12 M      50    0.562 1.07   1.06 2.6    2     0     0.5   0     NA   
#> 13 M      60    3.06  2.69   4    3.5    0     8     2     1     NA

with_cube

with_cube() automatically generates all possible combinations of specified variables in group_by clause.
with_cube() function is a simplified way of expressing grouping_sets().
with_cube() is equivalent to using grouping_sets() with all combinations of the specified columns.
For example, group_by(a,b,c) followed by with_cube() equals to grouping_sets(c('a','b','c'), c('a','b'), c('a','c'), c('b','c'), 'a', 'b', 'c', NA).
with_cube() is particularly useful when you want to include total aggregates of both rows and columns in a cross table.

# This produces a table with average page view counts grouped by gender and age, including total aggregates across all combinations.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>% 
  group_by(gender, age) %>% with_cube() %>% 
  summarize(avg_pv_cnt = mean(page_view_cnt)) %>% 
  pivot_wider(names_from = age, values_from = avg_pv_cnt)
#> # A tibble: 3 × 8
#>   gender  `NA`  `10`  `20`  `30`  `40`  `50`  `60`
#>   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F       2.28  2.33  2.86  2.67  2.33 2.24   1.48
#> 2 M       1.92  0.92  3.19  1.91  1.31 0.907  2.99
#> 3 NA      2.08  1.61  3.01  2.23  1.77 1.44   2.30

with_rollup

with_rollup() creates hierarchical aggregations by progressively reducing the number of grouping variables.
with_rollup() is particulary useful when variables have a hierarchy, because all possible combinations are not necessary.
group_by(a,b) followed by with_rollup() equals to grouping_sets(c('a','b'), 'a', NA).
group_by(a,b,c) followed by with_rollup() equals to grouping_sets(c('a','b','c'), ('a','b'), ('a'), NA).

# The variables "age_big" and "age" have a hierarchy. 
web_service_data_processed <- web_service_data %>% mutate(
  age_big = case_when(
    age %in% c(10,20,30) ~ 'young',
    age %in% c(40,50,60) ~ 'old'  
  )
)

# If there are aggregates "age_big & age", marginal aggregates for "age" are not necessary.
# The following code computes aggregates for "age_big & age", "age_big", and entire data set.
web_service_data_processed %>% group_by(age_big, age) %>% 
  with_rollup() %>% summarize(
  user_cnt = n_distinct(id),
  avg_pv_cnt = mean(page_view_cnt)
)
#> # A tibble: 9 × 4
#> # Groups:   age_big [3]
#>   age_big age   user_cnt avg_pv_cnt
#>   <chr>   <fct>    <int>      <dbl>
#> 1 old     40         196       2.52
#> 2 old     50         178       1.99
#> 3 old     60         204       2.32
#> 4 young   10         132       1.57
#> 5 young   20         140       3.69
#> 6 young   30         150       3.77
#> 7 old     NA         578       2.29
#> 8 young   NA         422       3.06
#> 9 NA      NA        1000       2.61

Ju Young Ahn

2024-08-27