# World Bank World Development Data: 216 countries, 59 years, 4 series (columns 9-12)
head(wlddev)
# Describe data
descr(wlddev)
# Panel-summarize columns 9 through 12 of this data (within and between countries)
qsu(wlddev, pid = ~ country, cols = 9:12, vlabels = TRUE)
# Do all of that by region and also compute higher moments -> returns a 4D array
qsu(wlddev, ~ region, ~ country, cols = 9:12, higher = TRUE)
# Return as nested list of statistics-matrices instead
suml <- qsu(wlddev, ~ region, ~ country,
cols = 9:12, higher = TRUE, array = FALSE)
str(suml)
# Create data.frame from this list with 3 identifier columns
unlist2d(suml, idcols = c("Variable","Trans"), row.names = "Region")
# Compute the means of all the regions and create a simpler data.frame instead
unlist2d(rapply2d(suml, fmean), idcols = c("Variable","Trans"))
# Select columns from wlddev: same as wlddev[9:12] but 2x faster and works with data.tables etc.
series <- get_vars(wlddev, 9:12)
# Replace columns, 4x faster than wlddev[9:12] <- series and also replaces names
get_vars(wlddev, 9:12) <- series
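# Hedged sketch: get_vars() also accepts column names, so the selection above can be
# written by name (assuming columns 9:12 are PCGDP, LIFEEX, GINI and ODA in this dataset)
series_by_name <- get_vars(wlddev, c("PCGDP","LIFEEX","GINI","ODA"))
identical(series, series_by_name)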
# Fast conversion to data.table with qDT, and subsetting rows:
library(data.table)
qDT(wlddev)[country == "Ireland"]
# Calculating fast column-wise statistics
fNobs(series) # Number of non-missing values
fmean(series) # means of series
fmedian(series) # medians of series
fmin(series) # mins of series
# Fast grouped statistics
fNobs(series, wlddev$region) # regional number of obs
fmean(series, wlddev$region) # regional means
fmedian(series, wlddev$region) # regional medians
fsd(series, wlddev$region) # regional standard deviations
# Means by region and income
fmean(series, get_vars(wlddev, c("region","income")))
# Same using GRP objects:
g <- GRP(wlddev, ~ region + income)
print(g)
plot(g)
# GRP objects are extremely efficient inputs to fast functions
fmean(series, g)
fmedian(series, g)
fsd(series, g)
# Faster aggregations with dplyr:
library(dplyr) # This is a lot faster than summarize_all(mean)
wlddev %>% group_by(region,income) %>% select(PCGDP,LIFEEX) %>% fmean
# Data-Apply to columns
head(dapply(series, log))
dapply(series, quantile, na.rm = TRUE)
# Data-Apply to rows (for sum use rowSums(qM(series), na.rm = TRUE), same for rowMeans ...)
head(dapply(series, max, MARGIN = 1, na.rm = TRUE))
head(dapply(mtcars, quantile, MARGIN = 1))
# qM -> quickly convert data to matrix, qDF/qDT do the reverse
fmean(rowSums(qM(series), na.rm = TRUE))
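# Sketch: the rowMeans() analogue mentioned above, and qDF() converting the matrix back
head(rowMeans(qM(series), na.rm = TRUE))
head(qDF(qM(series)))   # round-trip back to a data.frame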
# Split-apply combine computing on columns
BY(series, wlddev$region, sum, na.rm = TRUE) # Prefer fsum(series, wlddev$region) -> faster (shown below)
BY(series, wlddev$region, quantile, na.rm = TRUE)
BY(series, wlddev$region, quantile, na.rm = TRUE, expand.wide = TRUE)
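# As recommended above, the faster equivalent for grouped sums is the fast function itself
fsum(series, wlddev$region)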
# Convert panel-data to array
psar <- psmat(wlddev, ~country, ~year, cols = 9:12)
str(psar)
psar["Ireland",,] # Fast data access
psar["Ireland",,"PCGDP"]
psar[,"2016",]
qDF(psar[,"2016",], row.names.col = "Country") # Convert to data.frame
plot(psar) # Visualize
plot(psar, colour = TRUE, labs = vlabels(wlddev)[9:12])
plot(psar[c("Brazil","India","South Africa","Russian Federation","China"),,
c("PCGDP","LIFEEX","ODA")], legend = TRUE, labs = vlabels(wlddev)[c(9:10,12)])
plot(ts(psar["Brazil",,], 1960, 2018), main = "Brazil, 1960-2018")
# Aggregate this data by country and decade: Numeric columns with mean, categorical with mode
head(collap(wlddev, ~ country + decade, fmean, fmode))
# Multi-function aggregation of certain columns
head(collap(wlddev, ~ country + decade,
list(fmean, fmedian, fsd),
list(ffirst, flast), cols = c(3,9:12)))
# Customized Aggregation: Assign columns to functions
head(collap(wlddev, ~ country + decade,
custom = list(fmean = 9:10, fsd = 9:12, flast = 3, ffirst = 6:8)))
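# Hedged sketch: collap() also supports weighted aggregation via its 'w' argument
# (weighting by ODA here is purely illustrative, as in the dplyr example further below;
# ODA contains missing values, so treat the output accordingly)
head(collap(wlddev, ~ country + decade, fmean, fmode, w = ~ ODA))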
# Fast functions can also do grouped transformations:
head(fsd(series, g, TRA = "/")) # Scale series by region and income
head(fsum(series, g, TRA = "%")) # Percentages by region and income
head(fmean(series, g, TRA = "-")) # Demean / center by region and income
head(fmedian(series, g, TRA = "-")) # De-median by region and income
gmeds <- fmedian(series, g) # Same thing in 2 steps
head(TRA(series, gmeds, "-", g))
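# Hedged sketch: if your collapse version provides them, fwithin() / fbetween() are
# dedicated functions for the centering shown above
head(fwithin(series, g))    # same result as fmean(series, g, TRA = "-")
head(fbetween(series, g))   # group means expanded to the original rows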
# Faster transformations with dplyr:
# (here we are demeaning PCGDP and LIFEEX using weighted means, weighted by ODA)
wlddev %>% group_by(region,income) %>% select(PCGDP,LIFEEX,ODA) %>% fmean(ODA, "-")
## But there are also specialized transformation operators for common jobs:
# Centering (Within-transforming) the 4 series by country
head(W(wlddev, ~ country, cols = 9:12))
# Same but adding overall mean back after subtracting out group means
head(W(wlddev, ~ country, cols = 9:12, mean = "overall.mean"))
# Partialling out country and year fixed effects from 2 series (qF = quick-factor)
head(HDW(wlddev, PCGDP + LIFEEX ~ qF(country) + qF(year)))
# Same, adding ODA as continuous regressor
head(HDW(wlddev, PCGDP + LIFEEX ~ qF(country) + qF(year) + ODA))
# Standardizing (scaling and centering) by country
head(STD(wlddev, ~ country, cols = 9:12))
# Computing 1 lead and 3 lags of the 4 series (0 keeps the level): efficient, exactly identified panel computations
head(L(wlddev, -1:3, ~ country, ~year, cols = 9:12))
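# L() is the operator version of flag(); a hedged single-lag equivalent:
head(flag(series, 1, wlddev$country, wlddev$year))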
# Computing the 1- and 10-year first differences of the 4 series
head(D(wlddev, c(1,10), 1, ~ country, ~year, cols = 9:12))
head(D(wlddev, c(1,10), 1:2, ~ country, ~year, cols = 9:12)) # first and second differences
head(D(wlddev, -1:1, 1, ~ country, ~year, cols = 9:12)) # 1-year lagged and leaded FD
# Computing the 1- and 10-year growth rates of the 4 series (also keeping the level series)
head(G(wlddev, c(0,1,10), 1, ~ country, ~year, cols = 9:12))
# Adding exactly identified growth rates using data.table
setDT(wlddev)[, paste0("G.", names(wlddev)[9:12]) := fgrowth(.SD,1,1,iso3c,year), .SDcols = 9:12]
# Computing the 1- and 10-year log-differences of GDP per capita and life expectancy
head(G(wlddev, c(0,1,10), 1, PCGDP + LIFEEX ~ country, ~year, logdiff = TRUE))
# Same transformations using the plm package:
library(plm)
pwlddev <- pdata.frame(wlddev, index = c("country","year"))
W(pwlddev$PCGDP) # Country-demeaning
W(pwlddev, cols = 9:12)
W(pwlddev$PCGDP, effect = 2) # Time-demeaning
W(pwlddev, effect = 2, cols = 9:12)
HDW(pwlddev$PCGDP) # Country- and time-demeaning
HDW(pwlddev, cols = 9:12)
STD(pwlddev$PCGDP) # Standardizing by country
STD(pwlddev, cols = 9:12)
L(pwlddev$PCGDP, -1:3) # Panel-lags
L(pwlddev, -1:3, 9:12)
G(pwlddev$PCGDP) # Panel-Growth rates
G(pwlddev, 1, 1, 9:12)