By hand: one file at a time
- Map: read one partition, keep employed
- Reduce: group_by + summarise per file, giving one partial result per partition
- lapply over files runs the map and the per-file reduce on each partition; bind_rows then stacks the partial results
Same pattern as the Python notebook, in R syntax.
# Map step: read one Parquet partition and keep only the rows where
# `actividad` equals 1.
#
# path: location of a single Parquet file; it is passed straight to
#   read_parquet().
#
# Returns the `dpto` and `factor_expansion` columns of the matching rows.
map_partition <- function(path) {
  df <- read_parquet(
    path,
    col_select = c("dpto", "actividad", "factor_expansion")
  )
  # which() keeps only positions where the comparison is TRUE. Indexing with
  # the bare logical mask would instead emit an all-NA row for every record
  # whose `actividad` is missing.
  keep <- which(df$actividad == 1)
  df[keep, c("dpto", "factor_expansion")]
}
# Reduce step for a single partition: one output row per `dpto`.
#
# emp: table with at least the columns `dpto` and `factor_expansion`.
#
# Returns a table with `dpto`, `count` (number of rows in the group) and
# `sum` (total of `factor_expansion` in the group).
reduce_part <- function(emp) {
  by_dpto <- group_by(emp, dpto)
  summarise(
    by_dpto,
    count = n(),
    sum = sum(factor_expansion)
  )
}
# Run the map step and then the per-file reduce on each entry of `files`, so
# every element of `parts` is the summary of one file.
parts <- lapply(files, function(path) {
  employed <- map_partition(path)
  reduce_part(employed)
})
# Stack the per-file summaries into one table; rows for the same `dpto` coming
# from different files are not merged here.
result <- bind_rows(parts)