library(cheapr)
library(bench)
# A use-case for data.table
# Adding 0 because can't update ALTREP by reference
df <- data.frame(x = 1:10^5 + 0L)
# Normal data frame lag
sset(lag_(df), 1:10)
# Lag these behind by 3 rows
sset(lag_(df, 3, set = TRUE), 1:10)
df$x[1:10] # x variable was updated by reference!
# The above can be used naturally in data.table to lag data
# without any copies
# To perform regular R row lags, just make sure set is `FALSE`
sset(lag_(as.data.frame(EuStockMarkets), 5), 1:10)
# lag2_ is a generalised version of lag_ that allows
# for much more complex lags
x <- 1:10
# lag every 2nd element
lag2_(x, n = c(1, 0)) # lag vector is recycled
# Explicit Lag(3) using a vector of lags
lags <- lag_sequence(length(x), 3, partial = FALSE)
lag2_(x, n = lags)
# Alternating lags and leads
lag2_(x, c(1, -1))
# Lag only the 3rd element
lags <- integer(length(x))
lags[3] <- 1L
lag2_(x, lags)
# lag in descending order (same as a lead)
lag2_(x, order = 10:1)
# lag that resets after index 5
lag2_(x, run_lengths = c(5, 5))
# lag with a time index
years <- sample(2011:2020)
lag2_(x, order = order(years))
# Example of how to do a cyclical lag
n <- length(x)
# When k >= 0
k <- min(3, n)
lag2_(x, c(rep(-n + k, k), rep(k, n - k)))
# When k < 0
k <- max(-3, -n)
lag2_(x, c(rep(k, n + k), rep(n + k, -k)))
# As it turns out, we can do a grouped lag
# by supplying group sizes as run lengths and group order as the order
set.seed(45)
g <- sample(c("a", "b"), 10, TRUE)
# NOTE: collapse::flag will not work unless g is already sorted!
# This is not an issue with lag2_()
collapse::flag(x, g = g)
lag2_(x, order = order(g), run_lengths = collapse::GRP(g)$group.sizes)
# For production code, we can of course make
# this more optimised by using collapse::radixorderv()
# Which calculates the order and group sizes all at once
o <- collapse::radixorderv(g, group.sizes = TRUE)
lag2_(x, order = o, run_lengths = attr(o, "group.sizes"))
# Let's finally wrap this up in a nice grouped-lag function
grouped_lag <- function(x, n = 1, g = integer(length(x))){
o <- collapse::radixorderv(g, group.sizes = TRUE, sort = FALSE)
lag2_(x, n, order = o, run_lengths = attr(o, "group.sizes"))
}
# And voila!
grouped_lag(x, g = g)
# A method to extract this information from dplyr
## We can actually get this information easily from a `grouped_df` object
## Uncomment the below code to run the implementation
# library(dplyr)
# library(timeplyr)
# eu_stock <- EuStockMarkets |>
# ts_as_tibble() |>
# group_by(stock_index = group)
# groups <- group_data(eu_stock) # Group information
# group_order <- unlist(groups$.rows) # Order of groups
# group_sizes <- lengths_(groups$.rows) # Group sizes
#
# # by-stock index lag
# lag2_(eu_stock$value, order = group_order, run_lengths = group_sizes)
#
# # Verifying this output is correct
# eu_stock |>
# ungroup() |>
# mutate(lag1 = lag_(value), .by = stock_index) |>
# mutate(lag2 = lag2_(value, order = group_order, run_lengths = group_sizes)) |>
# summarise(lags_are_equal = identical(lag1, lag2))
# Let's compare this to data.table
library(data.table)
default_threads <- getDTthreads()
setDTthreads(1)
dt <- data.table(x = 1:10^5,
g = sample.int(10^4, 10^5, TRUE))
bench::mark(dt[, y := shift(x), by = g][][["y"]],
grouped_lag(dt$x, g = dt$g),
iterations = 10)
setDTthreads(default_threads)
Run the code above in your browser using DataLab