## A Simple Introduction --------------------------------------
head(iris)
collap(iris, ~ Species) # Default: FUN = fmean for numeric
collapv(iris, 5) # Same using collapv
collap(iris, ~ Species, fmedian) # Using the median
collap(iris, ~ Species, fmedian, keep.col.order = FALSE) # Groups in-front
collap(iris, Sepal.Width + Petal.Width ~ Species, fmedian) # Only '.Width' columns
collapv(iris, 5, cols = c(2, 4)) # Same using collapv
collap(iris, ~ Species, list(fmean, fmedian)) # Two functions
collap(iris, ~ Species, list(fmean, fmedian), return = "long") # Long format
collapv(iris, 5, custom = list(fmean = 1:2, fmedian = 3:4)) # Custom aggregation
collapv(iris, 5, custom = list(fmean = 1:2, fmedian = 3:4), # Raw output, no column reordering
return = "list")
collapv(iris, 5, custom = list(fmean = 1:2, fmedian = 3:4), # A strange choice..
return = "long")
collap(iris, ~ Species, w = ~ Sepal.Length) # Using Sepal.Length as weights, ..
weights <- abs(rnorm(fnrow(iris)))
collap(iris, ~ Species, w = weights) # Some random weights..
collap(iris, iris$Species, w = weights) # Note this behavior..
collap(iris, iris$Species, w = weights,
keep.by = FALSE, keep.w = FALSE)
## Multi-Type Aggregation --------------------------------------
head(wlddev) # World Development Panel Data
head(collap(wlddev, ~ country + decade)) # Aggregate by country and decade
head(collap(wlddev, ~ country + decade, fmedian, ffirst)) # Different functions
head(collap(wlddev, ~ country + decade, cols = is.numeric)) # Aggregate only numeric columns
head(collap(wlddev, ~ country + decade, cols = 9:13)) # Only the 5 series
head(collap(wlddev, PCGDP + LIFEEX ~ country + decade)) # Only GDP and life-expactancy
head(collap(wlddev, PCGDP + LIFEEX ~ country + decade, fsum)) # Using the sum instead
head(collap(wlddev, PCGDP + LIFEEX ~ country + decade, sum, # Same using base::sum -> slower!
na.rm = TRUE))
head(collap(wlddev, wlddev[c("country","decade")], fsum, # Same, exploring different inputs
cols = 9:10))
head(collap(wlddev[9:10], wlddev[c("country","decade")], fsum))
head(collapv(wlddev, c("country","decade"), fsum)) # ..names/indices with collapv
head(collapv(wlddev, c(1,5), fsum))
g <- GRP(wlddev, ~ country + decade) # Precomputing the grouping
head(collap(wlddev, g, keep.by = FALSE)) # This is slightly faster now
# Aggregate categorical data using not the mode but the last element
head(collap(wlddev, ~ country + decade, fmean, flast))
head(collap(wlddev, ~ country + decade, catFUN = flast, # Aggregate only categorical data
cols = is_categorical))
## Weighted Aggregation ----------------------------------------
# We aggregate to region level using population weights
head(collap(wlddev, ~ region + year, w = ~ POP)) # Takes weighted mean for numeric..
# ..and weighted mode for categorical data. The weight vector is aggregated using fsum
head(collap(wlddev, ~ region + year, w = ~ POP, # Aggregating weights using sum
wFUN = list(sum = fsum, max = fmax))) # and max (corresponding to mode)
## Multi-Function Aggregation ----------------------------------
head(collap(wlddev, ~ country + decade, list(mean = fmean, N = fnobs), # Saving mean and Nobs
cols = 9:13))
head(collap(wlddev, ~ country + decade, # Same using base R -> slower
list(mean = mean,
N = function(x, ...) sum(!is.na(x))),
cols = 9:13, na.rm = TRUE))
lapply(collap(wlddev, ~ country + decade, # List output format
list(mean = fmean, N = fnobs), cols = 9:13, return = "list"), head)
head(collap(wlddev, ~ country + decade, # Long output format
list(mean = fmean, N = fnobs), cols = 9:13, return = "long"))
head(collap(wlddev, ~ country + decade, # Also aggregating categorical data,
list(mean = fmean, N = fnobs), return = "long_dupl")) # and duplicating it 2 times
head(collap(wlddev, ~ country + decade, # Now also using 2 functions on
list(mean = fmean, N = fnobs), list(mode = fmode, last = flast), # categorical data
keep.col.order = FALSE))
head(collap(wlddev, ~ country + decade, # More functions, string input,
c("fmean","fsum","fnobs","fsd","fvar"), # parallelized execution
c("fmode","ffirst","flast","fndistinct"), # (choose more than 1 cores,
parallel = TRUE, mc.cores = 1L, # depending on your machine)
keep.col.order = FALSE))
## Custom Aggregation ------------------------------------------
head(collap(wlddev, ~ country + decade, # Custom aggregation
custom = list(fmean = 11:13, fsd = 9:10, fmode = 7:8)))
head(collap(wlddev, ~ country + decade, # Using column names
custom = list(fmean = "PCGDP", fsd = c("LIFEEX","GINI"),
flast = "date")))
head(collap(wlddev, ~ country + decade, # Weighted parallelized custom
custom = list(fmean = 9:12, fsd = 9:10, # aggregation
fmode = 7:8), w = ~ POP,
wFUN = list(fsum, fmax),
parallel = TRUE, mc.cores = 1L))
head(collap(wlddev, ~ country + decade, # No column reordering
custom = list(fmean = 9:12, fsd = 9:10,
fmode = 7:8), w = ~ POP,
wFUN = list(fsum, fmax),
parallel = TRUE, mc.cores = 1L, keep.col.order = FALSE))
## Piped Use --------------------------------------------------
iris |> fgroup_by(Species) |> collapg()
wlddev |> fgroup_by(country, decade) |> collapg() |> head()
wlddev |> fgroup_by(region, year) |> collapg(w = POP) |> head()
wlddev |> fgroup_by(country, decade) |> collapg(fmedian, flast) |> head()
wlddev |> fgroup_by(country, decade) |>
collapg(custom = list(fmean = 9:12, fmode = 5:7, flast = 3)) |> head()
Run the code above in your browser using DataLab