# NOT RUN {
## A Simple Introduction --------------------------------------
# Look at the first rows of the data being aggregated.
head(iris)

# Default: FUN = fmean for numeric
collap(iris, ~ Species)
# Same using collapv
collapv(iris, 5)

# Using the median
collap(iris, ~ Species, fmedian)
# Groups in-front
collap(iris, ~ Species, fmedian, keep.col.order = FALSE)

# Only '.Width' columns
collap(iris, Sepal.Width + Petal.Width ~ Species, fmedian)
# Same using collapv
collapv(iris, 5, cols = c(2, 4))

# Two functions
collap(iris, ~ Species, list(fmean, fmedian))
# Long format
collap(iris, ~ Species, list(fmean, fmedian), return = "long")

# Custom aggregation
collapv(iris, 5, custom = list(fmean = 1:2, fmedian = 3:4))
# Raw output, no column reordering
collapv(
  iris, 5,
  custom = list(fmean = 1:2, fmedian = 3:4),
  return = "list"
)
# A strange choice..
collapv(
  iris, 5,
  custom = list(fmean = 1:2, fmedian = 3:4),
  return = "long"
)

# Using Sepal.Length as weights, ..
collap(iris, ~ Species, w = ~ Sepal.Length)

# Some random weights..
weights <- abs(rnorm(fnrow(iris)))
collap(iris, ~ Species, w = weights)
# Note this behavior..
collap(iris, iris$Species, w = weights)
collap(
  iris, iris$Species,
  w = weights,
  keep.by = FALSE, keep.w = FALSE
)
# }
# NOT RUN {
# iris |> fgroup_by(Species) |> collapg() # dplyr style, but faster (commented-out \donttest example)
# }
# NOT RUN {
## Multi-Type Aggregation --------------------------------------
# World Development Panel Data
head(wlddev)

# Aggregate by country and decade
head(collap(wlddev, ~ country + decade))
# Different functions
head(collap(wlddev, ~ country + decade, fmedian, ffirst))
# Aggregate only numeric columns
head(collap(wlddev, ~ country + decade, cols = is.numeric))
# Only the 5 series
head(collap(wlddev, ~ country + decade, cols = 9:13))

# Only GDP and life-expectancy
head(collap(wlddev, PCGDP + LIFEEX ~ country + decade))
# Using the sum instead
head(collap(wlddev, PCGDP + LIFEEX ~ country + decade, fsum))
# Same using base::sum -> slower!
head(
  collap(wlddev, PCGDP + LIFEEX ~ country + decade, sum, na.rm = TRUE)
)

# Same, exploring different inputs
head(
  collap(wlddev, wlddev[c("country", "decade")], fsum, cols = 9:10)
)
head(collap(wlddev[9:10], wlddev[c("country", "decade")], fsum))
# ..names/indices with collapv
head(collapv(wlddev, c("country", "decade"), fsum))
head(collapv(wlddev, c(1, 5), fsum))

# Precomputing the grouping
g <- GRP(wlddev, ~ country + decade)
# This is slightly faster now
head(collap(wlddev, g, keep.by = FALSE))

# Aggregate categorical data using not the mode but the last element
head(collap(wlddev, ~ country + decade, fmean, flast))
# Aggregate only categorical data
head(
  collap(
    wlddev, ~ country + decade,
    catFUN = flast,
    cols = is_categorical
  )
)
## Weighted Aggregation ----------------------------------------
# We aggregate to region level using population weights.
# Takes weighted mean for numeric..
# ..and weighted mode for categorical data. The weight vector is aggregated using fsum
head(collap(wlddev, ~ region + year, w = ~ POP))

# Aggregating weights using sum and max (corresponding to mode)
head(
  collap(
    wlddev, ~ region + year,
    w = ~ POP,
    wFUN = list(fsum, fmax)
  )
)
## Multi-Function Aggregation ----------------------------------
# Saving mean and Nobs
head(
  collap(
    wlddev, ~ country + decade,
    list(fmean, fnobs),
    cols = 9:13
  )
)
# Same using base R -> slower
# The anonymous Nobs counter declares `...` so that it accepts, and ignores,
# extra arguments such as the `na.rm = TRUE` given to collap() below
# (presumably forwarded to every supplied function -- see the collap() docs).
head(
  collap(
    wlddev, ~ country + decade,
    list(
      mean = mean,
      Nobs = function(x, ...) sum(!is.na(x))
    ),
    cols = 9:13, na.rm = TRUE
  )
)
# List output format
lapply(
  collap(
    wlddev, ~ country + decade,
    list(fmean, fnobs),
    cols = 9:13, return = "list"
  ),
  head
)

# Long output format
head(
  collap(
    wlddev, ~ country + decade,
    list(fmean, fnobs),
    cols = 9:13, return = "long"
  )
)

# Also aggregating categorical data, and duplicating it 2 times
head(
  collap(
    wlddev, ~ country + decade,
    list(fmean, fnobs),
    return = "long_dupl"
  )
)

# Now also using 2 functions on categorical data
head(
  collap(
    wlddev, ~ country + decade,
    list(fmean, fnobs),
    list(fmode, flast),
    keep.col.order = FALSE
  )
)

# More functions, string input, parallelized execution
# (choose more than 1 core, depending on your machine)
head(
  collap(
    wlddev, ~ country + decade,
    c("fmean", "fsum", "fnobs", "fsd", "fvar"),
    c("fmode", "ffirst", "flast", "fndistinct"),
    parallel = TRUE, mc.cores = 1L,
    keep.col.order = FALSE
  )
)
## Custom Aggregation ------------------------------------------
# Custom aggregation
head(
  collap(
    wlddev, ~ country + decade,
    custom = list(fmean = 9:13, fsd = 9:10, fmode = 7:8)
  )
)

# Using column names
head(
  collap(
    wlddev, ~ country + decade,
    custom = list(
      fmean = "PCGDP",
      fsd = c("LIFEEX", "GINI"),
      flast = "date"
    )
  )
)

# Weighted parallelized custom aggregation
head(
  collap(
    wlddev, ~ country + decade,
    custom = list(fmean = 9:12, fsd = 9:10, fmode = 7:8),
    w = ~ POP,
    wFUN = list(fsum, fmax),
    parallel = TRUE, mc.cores = 1L
  )
)

# No column reordering
head(
  collap(
    wlddev, ~ country + decade,
    custom = list(fmean = 9:12, fsd = 9:10, fmode = 7:8),
    w = ~ POP,
    wFUN = list(fsum, fmax),
    parallel = TRUE, mc.cores = 1L, keep.col.order = FALSE
  )
)
# }
# NOT RUN {
# % \donttest{
# }
# NOT RUN {
## Piped Use --------------------------------------------------
# Note: magrittr is used because |> is not available on older R versions
library(magrittr)

iris %>%
  fgroup_by(Species) %>%
  collapg()

wlddev %>%
  fgroup_by(country, decade) %>%
  collapg() %>%
  head()

wlddev %>%
  fgroup_by(region, year) %>%
  collapg(w = POP) %>%
  head()

wlddev %>%
  fgroup_by(country, decade) %>%
  collapg(fmedian, flast) %>%
  head()

wlddev %>%
  fgroup_by(country, decade) %>%
  collapg(custom = list(fmean = 9:12, fmode = 5:7, flast = 3)) %>%
  head()
# }
# NOT RUN {
# % }
# }
# Run the code above in your browser using DataLab