rollup : multiple group_by
operations into a single, concise statement
# From CRAN
install.packages("rollup")
# From Github
library(devtools)
devtools::install_github("JuYoungAhn/rollup")
rollup
package allow you to simplify multiple group_by
operations into a single, concise statement.library(dplyr)
library(rollup)
mtcars %>% group_by(vs, am) %>% grouping_sets("vs","am",c("vs","am"), NA) %>%
summarize(n=n(), avg_mpg=mean(mpg))
# A tibble: 9 × 4
vs am n avg_mpg
<dbl> <dbl> <int> <dbl>
1 0 NA 18 16.6
2 1 NA 14 24.6
3 NA 0 19 17.1
4 NA 1 13 24.4
5 0 0 12 15.0
6 0 1 6 19.8
7 1 0 7 20.7
8 1 1 7 28.4
9 NA NA 32 20.1
# A tibble: 7 × 4
# Groups: vs [3]
vs am n avg_mpg
<dbl> <dbl> <int> <dbl>
1 0 0 12 15.0
2 0 1 6 19.8
3 1 0 7 20.7
4 1 1 7 28.4
5 0 NA 18 16.6
6 1 NA 14 24.6
7 NA NA 32 20.1
# A tibble: 9 × 4
vs am n avg_mpg
<dbl> <dbl> <int> <dbl>
1 0 NA 18 16.6
2 1 NA 14 24.6
3 NA 0 19 17.1
4 NA 1 13 24.4
5 0 0 12 15.0
6 0 1 6 19.8
7 1 0 7 20.7
8 1 1 7 28.4
9 NA NA 32 20.1
This example shows how to compute the average pageview count grouped by various combinations of gender and age. In this section, you will see why the rollup package is useful by exploring practical data examples.
library(dplyr)
library(rollup)
data("web_service_data") # web_service_data of rollup package
# Description
# - date_id : yyyy-mm-dd
# - id : user unique id
# - gender : male(M), female(F)
# - age : age band (categorical)
# - page_view_cnt : pageview count of user on date_id
# - product_view_cnt_cat : decile category of the product view count for a user on date_id.
web_service_data %>% head
# A tibble: 6 × 6
date_id id gender age page_view_cnt product_view_cnt_cat
<chr> <dbl> <chr> <fct> <dbl> <fct>
1 2024-06-24 19 M 40 0 60%
2 2024-06-24 34 M 40 5 70%
3 2024-06-24 44 F 50 12 100%
4 2024-06-24 57 M 60 87 20%
5 2024-06-24 65 F 50 1 100%
6 2024-06-24 86 F 40 3 90%
grouping_sets()
allows you to perform multiple group_by
operations simultaneously, producing combined results in a single output.grouping_sets('a')
is equivalent to the single grouping set operation group_by(a)
.grouping_sets('a','b')
is equivalent to row binding of group_by(a)
and group_by(b)
.grouping_sets(c('a','b'),'a','b', NA)
is equivalent to row binding of group_by(a,b)
, group_by(a)
, group_by(b)
and without group_by
operation.library(tidyr)
# compute average of `page_view_cnt` group by "gender", "age", and "gender & age", along with the overall average. NA in the output table represents overall aggregates.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% grouping_sets('gender', 'age', c('gender','age'), NA) %>%
summarize(avg_pv_cnt = mean(page_view_cnt))
# A tibble: 21 × 3
gender age avg_pv_cnt
<chr> <fct> <dbl>
1 F <NA> 2.28
2 M <NA> 1.92
3 <NA> 10 1.61
4 <NA> 20 3.01
5 <NA> 30 2.23
6 <NA> 40 1.77
7 <NA> 50 1.44
8 <NA> 60 2.30
9 F 10 2.33
10 F 20 2.86
# ℹ 11 more rows
# compute average of `page_view_cnt` group by "gender & age & product_view_cnt_cat" along with the marginal average with regard to "product_view_cnt_cat".
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age, product_view_cnt_cat) %>%
grouping_sets('product_view_cnt_cat', c('product_view_cnt_cat', 'gender','age')) %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>%
pivot_wider(names_from = product_view_cnt_cat, values_from = avg_pv_cnt)
# A tibble: 13 × 11
gender age X `20%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
<chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 <NA> <NA> 1.46 1.84 2.02 2.31 2.72 2.89 2.8 3.79 2.82
2 F 10 1.4 2 1.4 2.67 4 NA NA 4 NA
3 F 20 0 3.5 2.08 2.29 3.83 2.57 3.45 4.83 2.25
4 F 30 0.833 2.5 4.5 2.88 3 1.75 3.5 3 3.17
5 F 40 1.33 1.9 2.7 2.2 1.22 3 3.38 4 2
6 F 50 0.462 1.5 2 2.5 1.2 4 2.5 5.33 3.5
7 F 60 1.19 1.71 1 1.33 3 3 1.5 2 3
8 M 10 0.375 0.833 1.14 3 1 0 NA NA NA
9 M 20 1.14 3.17 3.16 3.55 4.5 3 NA 3.5 7
10 M 30 0.824 1.62 1.31 2.7 3.38 2.5 1.86 3.5 NA
11 M 40 0.889 0.933 2.06 0.833 1.88 3.25 1.6 1.67 NA
12 M 50 0.562 1.07 1.06 2.6 2 0 0.5 0 NA
13 M 60 3.06 2.69 4 3.5 0 8 2 1 NA
with_cube()
automatically generates all possible combinations of specified variables in group_by
clause.with_cube()
function is a simplified way of expressing grouping_sets()
.with_cube()
is equivalent to using grouping_sets()
with all combinations of the specified columns.group_by(a,b,c)
followed by with_cube()
equals to grouping_sets(c('a','b','c'), c('a','b'), c('a','c'), c('b','c'), 'a', 'b', 'c', NA)
.with_cube()
is particularly useful when you want to include total aggregates of both rows and columns in a cross table.# This produces a table with average page view counts grouped by gender and age, including total aggregates across all combinations.
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% with_cube() %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>%
pivot_wider(names_from = age, values_from = avg_pv_cnt)
# A tibble: 3 × 8
gender `NA` `10` `20` `30` `40` `50` `60`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 2.28 2.33 2.86 2.67 2.33 2.24 1.48
2 M 1.92 0.92 3.19 1.91 1.31 0.907 2.99
3 <NA> 2.08 1.61 3.01 2.23 1.77 1.44 2.30
with_rollup()
creates hierarchical aggregations by progressively reducing the number of grouping variables.with_rollup()
is particulary useful when variables have a hierarchy, because all possible combinations are not necessary.group_by(a,b)
followed by with_rollup()
equals to grouping_sets(c('a','b'), 'a', NA)
.group_by(a,b,c)
followed by with_rollup()
equals to grouping_sets(c('a','b','c'), ('a','b'), ('a'), NA)
.# The variables "age_big" and "age" have a hierarchy.
web_service_data_processed <- web_service_data %>% mutate(
age_big = case_when(
age %in% c(10,20,30) ~ 'young',
age %in% c(40,50,60) ~ 'old'
)
)
# If there are aggregates "age_big & age", marginal aggregates for "age" are not necessary.
# The following code computes aggregates for "age_big & age", "age_big", and entire data set.
web_service_data_processed %>% group_by(age_big, age) %>%
with_rollup() %>% summarize(
user_cnt = n_distinct(id),
avg_pv_cnt = mean(page_view_cnt)
)
# A tibble: 9 × 4
# Groups: age_big [3]
age_big age user_cnt avg_pv_cnt
<chr> <fct> <int> <dbl>
1 old 40 196 2.52
2 old 50 178 1.99
3 old 60 204 2.32
4 young 10 132 1.57
5 young 20 140 3.69
6 young 30 150 3.77
7 old <NA> 578 2.29
8 young <NA> 422 3.06
9 <NA> <NA> 1000 2.61