rollup

R package

rollup : multiple group_by operations into a single, concise statement

Ju Young Ahn juyoungahn.github.io
2024-08-26

Installation

# From CRAN
install.packages("rollup")
 
# From Github
library(devtools)
devtools::install_github("JuYoungAhn/rollup")

In a Nutshell

library(dplyr)
library(rollup)
mtcars %>% group_by(vs, am) %>% grouping_sets("vs","am",c("vs","am"), NA) %>% 
  summarize(n=n(), avg_mpg=mean(mpg))
# A tibble: 9 × 4
     vs    am     n avg_mpg
  <dbl> <dbl> <int>   <dbl>
1     0    NA    18    16.6
2     1    NA    14    24.6
3    NA     0    19    17.1
4    NA     1    13    24.4
5     0     0    12    15.0
6     0     1     6    19.8
7     1     0     7    20.7
8     1     1     7    28.4
9    NA    NA    32    20.1
mtcars %>% group_by(vs, am) %>% with_rollup() %>% 
  summarize(n=n(), avg_mpg=mean(mpg))
# A tibble: 7 × 4
# Groups:   vs [3]
     vs    am     n avg_mpg
  <dbl> <dbl> <int>   <dbl>
1     0     0    12    15.0
2     0     1     6    19.8
3     1     0     7    20.7
4     1     1     7    28.4
5     0    NA    18    16.6
6     1    NA    14    24.6
7    NA    NA    32    20.1
mtcars %>% group_by(vs, am) %>% with_cube() %>% 
  summarize(n=n(), avg_mpg=mean(mpg))
# A tibble: 9 × 4
     vs    am     n avg_mpg
  <dbl> <dbl> <int>   <dbl>
1     0    NA    18    16.6
2     1    NA    14    24.6
3    NA     0    19    17.1
4    NA     1    13    24.4
5     0     0    12    15.0
6     0     1     6    19.8
7     1     0     7    20.7
8     1     1     7    28.4
9    NA    NA    32    20.1

Practical example

This example shows how to compute the average pageview count grouped by various combinations of gender and age. In this section, you will see why the rollup package is useful by exploring practical data examples.

Web service data

library(dplyr)
library(rollup)
data("web_service_data") # web_service_data of rollup package

# Description
# - date_id : yyyy-mm-dd
# - id : user unique id
# - gender : male(M), female(F)
# - age : age band (categorical)
# - page_view_cnt : pageview count of user on date_id
# - product_view_cnt_cat : decile category of the product view count for a user on date_id.
web_service_data %>% head
# A tibble: 6 × 6
  date_id       id gender age   page_view_cnt product_view_cnt_cat
  <chr>      <dbl> <chr>  <fct>         <dbl> <fct>               
1 2024-06-24    19 M      40                0 60%                 
2 2024-06-24    34 M      40                5 70%                 
3 2024-06-24    44 F      50               12 100%                
4 2024-06-24    57 M      60               87 20%                 
5 2024-06-24    65 F      50                1 100%                
6 2024-06-24    86 F      40                3 90%                 

grouping_sets

library(tidyr)
# compute average of `page_view_cnt` group by "gender", "age", and "gender & age", along with the overall average. NA in the output table represents overall aggregates.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>% 
  group_by(gender, age) %>% grouping_sets('gender', 'age', c('gender','age'), NA) %>% 
  summarize(avg_pv_cnt = mean(page_view_cnt))
# A tibble: 21 × 3
   gender age   avg_pv_cnt
   <chr>  <fct>      <dbl>
 1 F      <NA>        2.28
 2 M      <NA>        1.92
 3 <NA>   10          1.61
 4 <NA>   20          3.01
 5 <NA>   30          2.23
 6 <NA>   40          1.77
 7 <NA>   50          1.44
 8 <NA>   60          2.30
 9 F      10          2.33
10 F      20          2.86
# ℹ 11 more rows
# compute average of `page_view_cnt` group by "gender & age & product_view_cnt_cat" along with the marginal average with regard to "product_view_cnt_cat".
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>% 
  group_by(gender, age, product_view_cnt_cat) %>% 
  grouping_sets('product_view_cnt_cat', c('product_view_cnt_cat', 'gender','age')) %>% 
  summarize(avg_pv_cnt = mean(page_view_cnt)) %>% 
  pivot_wider(names_from = product_view_cnt_cat, values_from = avg_pv_cnt)
# A tibble: 13 × 11
   gender age       X `20%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
   <chr>  <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
 1 <NA>   <NA>  1.46  1.84   2.02 2.31   2.72  2.89  2.8   3.79   2.82
 2 F      10    1.4   2      1.4  2.67   4    NA    NA     4     NA   
 3 F      20    0     3.5    2.08 2.29   3.83  2.57  3.45  4.83   2.25
 4 F      30    0.833 2.5    4.5  2.88   3     1.75  3.5   3      3.17
 5 F      40    1.33  1.9    2.7  2.2    1.22  3     3.38  4      2   
 6 F      50    0.462 1.5    2    2.5    1.2   4     2.5   5.33   3.5 
 7 F      60    1.19  1.71   1    1.33   3     3     1.5   2      3   
 8 M      10    0.375 0.833  1.14 3      1     0    NA    NA     NA   
 9 M      20    1.14  3.17   3.16 3.55   4.5   3    NA     3.5    7   
10 M      30    0.824 1.62   1.31 2.7    3.38  2.5   1.86  3.5   NA   
11 M      40    0.889 0.933  2.06 0.833  1.88  3.25  1.6   1.67  NA   
12 M      50    0.562 1.07   1.06 2.6    2     0     0.5   0     NA   
13 M      60    3.06  2.69   4    3.5    0     8     2     1     NA   

with_cube

# This produces a table with average page view counts grouped by gender and age, including total aggregates across all combinations.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>% 
  group_by(gender, age) %>% with_cube() %>% 
  summarize(avg_pv_cnt = mean(page_view_cnt)) %>% 
  pivot_wider(names_from = age, values_from = avg_pv_cnt)
# A tibble: 3 × 8
  gender  `NA`  `10`  `20`  `30`  `40`  `50`  `60`
  <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F       2.28  2.33  2.86  2.67  2.33 2.24   1.48
2 M       1.92  0.92  3.19  1.91  1.31 0.907  2.99
3 <NA>    2.08  1.61  3.01  2.23  1.77 1.44   2.30

with_rollup

# The variables "age_big" and "age" have a hierarchy. 
web_service_data_processed <- web_service_data %>% mutate(
  age_big = case_when(
    age %in% c(10,20,30) ~ 'young',
    age %in% c(40,50,60) ~ 'old'  
  )
)

# If there are aggregates "age_big & age", marginal aggregates for "age" are not necessary.
# The following code computes aggregates for "age_big & age", "age_big", and entire data set.
web_service_data_processed %>% group_by(age_big, age) %>% 
  with_rollup() %>% summarize(
  user_cnt = n_distinct(id),
  avg_pv_cnt = mean(page_view_cnt)
)
# A tibble: 9 × 4
# Groups:   age_big [3]
  age_big age   user_cnt avg_pv_cnt
  <chr>   <fct>    <int>      <dbl>
1 old     40         196       2.52
2 old     50         178       1.99
3 old     60         204       2.32
4 young   10         132       1.57
5 young   20         140       3.69
6 young   30         150       3.77
7 old     <NA>       578       2.29
8 young   <NA>       422       3.06
9 <NA>    <NA>      1000       2.61