# NOT RUN {
# World Bank World Development Data: 216 countries, 59 years, 4 series (columns 9-12)
head(wlddev)
# Describe data (detailed statistical description of each column)
descr(wlddev)
# Pairwise correlations with p-value
pwcor(num_vars(wlddev), P = TRUE)
# Panel-summarize columns 9 through 12 of this data (within and between countries)
qsu(wlddev, pid = ~ country, cols = 9:12, vlabels = TRUE)
# Do all of that by region and also compute higher moments -> returns a 4D array
qsu(wlddev, ~ region, ~ country, cols = 9:12, higher = TRUE)
# Return as nested list of statistics-matrices instead (array = FALSE)
suml <- qsu(wlddev, ~ region, ~ country,
cols = 9:12, higher = TRUE, array = FALSE)
str(suml)
# Create data.frame from this nested list with 3 identifier columns
head(unlist2d(suml, idcols = c("Variable","Trans"), row.names = "Region"))
# }
# NOT RUN {
# Compute the means of all the regions and create a simpler data.frame instead
# unlist2d(rapply2d(suml, fmean), idcols = c("Variable","Trans"))
# }
# NOT RUN {
# Select columns from wlddev
series <- get_vars(wlddev, 9:12) # same as wlddev[9:12] but 2x faster and works with data.tables
series <- fselect(wlddev, PCGDP:ODA) # Same thing: > 100x faster t. dplyr::select(wlddev, PCGDP:ODA)
# Replace columns, 8x faster than wlddev[9:12] <- series and also replaces names
get_vars(wlddev, 9:12) <- series
# Fast subsetting (fsubset = fast analogue of subset())
head(fsubset(wlddev, country == "Ireland", -country, -iso3c))
head(fsubset(wlddev, country == "Ireland" & year > 1990, year, PCGDP:ODA))
ss(wlddev, 1:10, 1:10) # This is an order of magnitude faster than wlddev[1:10, 1:10]
# Fast transforming (ftransform = fast analogue of transform(); NULL removes a column)
head(ftransform(wlddev, ODA_GDP = ODA / PCGDP, ODA_LIFEEX = sqrt(ODA) / LIFEEX))
head(ftransform(wlddev, ODA_GDP = ODA / PCGDP, PCGDP = NULL, ODA = NULL, GINI_sum = fsum(GINI)))
# Calculating fast column-wise statistics
fNobs(series) # Number of non-missing values
fmean(series) # means of series
fmedian(series) # medians of series
fmin(series) # mins of series
# Fast grouped statistics: second argument supplies the grouping vector
fNobs(series, wlddev$region) # regional number of obs
fmean(series, wlddev$region) # regional means
fmedian(series, wlddev$region) # regional medians
fsd(series, wlddev$region) # regional standard-deviations
# Means by region and income (grouping by multiple columns)
fmean(series, fselect(wlddev, region, income))
# Same using GRP objects:
g <- GRP(wlddev, ~ region + income)
print(g)
plot(g)
# GRP objects are extremely efficient inputs to fast functions
# (grouping is computed once and reused across calls)
fmean(series, g)
fmedian(series, g)
fsd(series, g)
# Another option is creating a grouped_df, using dplyr::group_by or the faster fgroup_by
gseries <- fgroup_by(fselect(wlddev, region, income, PCGDP:ODA), region, income)
str(gseries)
fmean(gseries) # grouped mean
fmean(gseries, w = ODA) # weighted grouped mean, weighted by ODA
fsd(gseries, w = ODA) # Weighted group standard deviation
# }
# NOT RUN {
# Faster aggregations with dplyr:
library(dplyr) # This is already a lot faster than summarize_all(mean)
wlddev %>% group_by(region,income) %>% select(PCGDP,LIFEEX) %>% fmean
# Now this is getting fast, apart from the pipe which still slows things down...
# (fgroup_by/fselect avoid dplyr's grouping and selection overhead)
wlddev %>% fgroup_by(region,income) %>% fselect(PCGDP,LIFEEX) %>% fmean
# }
# NOT RUN {
# Data-Apply to columns
head(dapply(series, log))
dapply(series, quantile, na.rm = TRUE)
# Data-Apply to rows (for sum use rowSums(qM(series), na.rm = TRUE), same for rowMeans ...)
head(dapply(mtcars, max, MARGIN = 1, na.rm = TRUE))
head(dapply(mtcars, quantile, MARGIN = 1))
# qM -> quickly convert data to matrix, qDF/qDT do the reverse
fmean(rowSums(qM(series), na.rm = TRUE))
# Split-apply-combine computing on columns (BY = grouped version of dapply)
BY(series, wlddev$region, sum, na.rm = TRUE) # Please use: fsum(series, wlddev$region) -> faster
BY(series, wlddev$region, quantile, na.rm = TRUE)
BY(series, wlddev$region, quantile, na.rm = TRUE, expand.wide = TRUE)
# Convert panel-data to 3D array (country x year x variable)
psar <- psmat(wlddev, ~country, ~year, cols = 9:12)
str(psar)
psar["Ireland",,] # Fast data access
psar["Ireland",,"PCGDP"]
psar[,"2016",]
qDF(psar[,"2016",], row.names.col = "Country") # Convert to data.frame
plot(psar, colour = TRUE, labs = vlabels(wlddev)[9:12]) # Visualize
plot(psar[c("Brazil","India","South Africa","Russian Federation","China"),,
c("PCGDP","LIFEEX","ODA")], legend = TRUE, labs = vlabels(wlddev)[c(9:10,12)])
plot(ts(psar["Brazil",,], 1960, 2018), main = "Brazil, 1960-2018")
# Aggregate this data by country and decade: Numeric columns with mean, categorical with mode
head(collap(wlddev, ~ country + decade, fmean, fmode))
# Multi-function aggregation of certain columns
head(collap(wlddev, ~ country + decade,
list(fmean, fmedian, fsd),
list(ffirst, flast), cols = c(3,9:12)))
# Customized Aggregation: Assign columns to functions
head(collap(wlddev, ~ country + decade,
custom = list(fmean = 9:10, fsd = 9:12, flast = 3, ffirst = 6:8)))
# Fast functions can also do grouped transformations via the TRA argument:
head(fsd(series, g, TRA = "/")) # Scale series by region and income
head(fsum(series, g, TRA = "%")) # Percentages by region and income
head(fmean(series, g, TRA = "-")) # Demean / center by region and income
head(fmedian(series, g, TRA = "-")) # De-median by region and income
gmeds <- fmedian(series, g) # Same thing in 2 steps
head(TRA(series, gmeds, "-", g))
# }
# NOT RUN {
# Faster transformations with dplyr:
wlddev %>% fgroup_by(region,income) %>% fselect(PCGDP,LIFEEX,ODA) %>%
fwithin(ODA) # Centering using weighted means, weighted by ODA
# }
# NOT RUN {
## But there are also tidy transformation operators for common jobs:
# Centering (within-transforming) the 4 series by country
head(W(wlddev, ~ country, cols = 9:12))
# Same but adding overall mean back after subtracting out group means
head(W(wlddev, ~ country, cols = 9:12, mean = "overall.mean"))
# Partialling out country and year fixed effects from 2 series (qF = quick-factor)
head(HDW(wlddev, PCGDP + LIFEEX ~ qF(country) + qF(year)))
# Same, adding ODA as continuous regressor
head(HDW(wlddev, PCGDP + LIFEEX ~ qF(country) + qF(year) + ODA))
# Standardizing (scaling and centering) by country
head(STD(wlddev, ~ country, cols = 9:12))
# Computing 1 lead and 3 lags of the 4 series: Panel-computations efficient and exactly identified
head(L(wlddev, -1:3, ~ country, ~year, cols = 9:12))
# Computing the 1- and 10-year first differences of the 4 series
head(D(wlddev, c(1,10), 1, ~ country, ~year, cols = 9:12))
head(D(wlddev, c(1,10), 1:2, ~ country, ~year, cols = 9:12)) # first and second differences
head(D(wlddev, -1:1, 1, ~ country, ~year, cols = 9:12)) # 1-year lagged and leaded FD
# Computing the 1- and 10-year growth rates of the 4 series (0 keeps the level series)
head(G(wlddev, c(0,1,10), 1, ~ country, ~year, cols = 9:12))
# }
# NOT RUN {
# Adding exactly identified growth rates using data.table
# (new columns are prefixed "G." via paste0 below)
library(data.table)
setDT(wlddev)[, paste0("G.", names(wlddev)[9:12]) := fgrowth(.SD,1,1,iso3c,year), .SDcols = 9:12]
# }
# NOT RUN {
# Deleting again and doing the same thing with add_vars
# NOTE(fix): the growth columns created above are prefixed "G.", so the
# deletion regex must match that prefix. An unanchored "G1." matched none of
# them (leaving duplicate growth columns behind), and an unescaped "G." would
# also match e.g. "GINI". Anchor at the start and escape the dot.
get_vars(wlddev, "^G\\.", regex = TRUE) <- NULL
add_vars(wlddev) <- fgrowth(gv(wlddev, 9:12), 1, 1, wlddev$iso3c, wlddev$year)
get_vars(wlddev, "^G1\\.", regex = TRUE) <- NULL # fgrowth names its output "G1.<var>"
# Computing the 1- and 10-year log-differences of GDP per capita and Life-Expectancy
head(G(wlddev, c(0,1,10), 1, PCGDP + LIFEEX ~ country, ~year, logdiff = TRUE))
# Same transformations using plm package:
# }
# NOT RUN {
library(plm)
pwlddev <- pdata.frame(wlddev, index = c("country","year"))
head(W(pwlddev$PCGDP)) # Country-demeaning
head(W(pwlddev, cols = 9:12))
head(W(pwlddev$PCGDP, effect = 2)) # Time-demeaning
head(W(pwlddev, effect = 2, cols = 9:12))
head(HDW(pwlddev$PCGDP)) # Country- and time-demeaning
head(HDW(pwlddev, cols = 9:12))
head(STD(pwlddev$PCGDP)) # Standardizing by country
head(STD(pwlddev, cols = 9:12))
head(L(pwlddev$PCGDP, -1:3)) # Panel-lags
head(L(pwlddev, -1:3, 9:12))
head(G(pwlddev$PCGDP)) # Panel-Growth rates
head(G(pwlddev, 1, 1, 9:12))
# }
# Run the code above in your browser using DataLab