## World Development Panel Data
# qsu() and the wlddev data set are provided by the 'collapse' package; attach
# it so that this script also runs outside the package's example environment.
library(collapse)

# Simple Summaries -------------------------
# Simple summary
qsu(wlddev)
# Display variable labels
qsu(wlddev, vlabels = TRUE)
# Add skewness and kurtosis
qsu(wlddev, higher = TRUE)
# Grouped Summaries ------------------------
# Statistics by World Bank region
qsu(wlddev, by = ~ region, vlabels = TRUE)

# Summarize GDP per capita and life expectancy by World Bank income level
qsu(wlddev, by = PCGDP + LIFEEX ~ income)

# Same variables (selected as columns 9:10), by both region and income,
# with skewness and kurtosis added
stats <- qsu(
  wlddev,
  by = ~ region + income,
  cols = 9:10,
  higher = TRUE
)

# A different perspective on the same stats
aperm(stats)
# Panel Data Summaries ---------------------
# Adding between- and within-country statistics
qsu(wlddev, pid = ~ iso3c, vlabels = TRUE)
# -> They show, amongst other things, that year and decade are
# individual-invariant, that we have GINI data on only 161 countries, with only
# 8.42 observations per country on average, and that GDP, LIFEEX and GINI vary
# more between countries, but ODA received varies more within countries over
# time.

# Let's do this manually for PCGDP:
x <- wlddev[["PCGDP"]]
g <- wlddev[["iso3c"]]

# This is the exact variance decomposition
all.equal(fvar(x), fvar(B(x, g)) + fvar(W(x, g)))

# What qsu does is calculate
r <- rbind(
  Overall = qsu(x),
  # Aggregation instead of between-transform
  Between = qsu(fmean(x, g)),
  # Same as qsu(W(x, g) + fmean(x))
  Within = qsu(fwithin(x, g, mean = "overall.mean"))
)
# Replace the within N by the overall N divided by the between N
r["Within", "N"] <- r["Overall", "N"] / r["Between", "N"]
print.qsu(r)

# Proof:
qsu(x, pid = g)
# Using indexed data:
# The panel identifiers (iso3c, year) are attached to the data once via
# findex_by(), so the qsu() calls below need no pid argument.
wldi <- findex_by(wlddev, iso3c, year) # Creating an indexed data frame from this data
qsu(wldi) # Summary for the indexed frame -> same as qsu(wlddev, pid = ~ iso3c)
qsu(wldi$PCGDP) # Default summary for a panel series (a column of the indexed frame)
qsu(G(wldi$PCGDP)) # Summarizing GDP growth, see also ?G
# Grouped Panel Data Summaries -------------
# Panel statistics by region
qsu(wlddev, by = ~ region, pid = ~ iso3c, cols = 9:12)

# Same on indexed data
psr <- qsu(wldi, ~ region, cols = 9:12)

# -> Gives a 4D array
psr

# Checking out the number of observations:
psr[, "N/T", , ]
# In North America we only have 3 countries; for the GINI we only have 3.91
# observations on average for 45 Sub-Saharan African countries, etc.

# Considering only standard deviations
psr[, "SD", , ]
# -> In all regions, variations in inequality (GINI) between countries are
# greater than variations in inequality within countries. The opposite is true
# for life expectancy in all regions apart from Europe, etc.
# Again let's do this manually for PCGDP:
# Stack the raw series next to its between- and within-transformed versions
d <- cbind(
  Overall = x,
  Between = fbetween(x, g),
  Within = fwithin(x, g, mean = "overall.mean")
)

# Summarize all three columns by region
r <- qsu(d, g = wlddev$region)

# Between N: number of distinct countries per region with non-missing x
r[, "N", "Between"] <- fndistinct(g[!is.na(x)], wlddev$region[!is.na(x)])
# Within N: overall N divided by between N
r[, "N", "Within"] <- r[, "N", "Overall"] / r[, "N", "Between"]
r

# Proof:
qsu(wlddev, by = PCGDP ~ region, pid = ~ iso3c)
# Weighted Summaries -----------------------
# Fix the RNG seed so that the random weights (and the positions of the
# missing weights below) are identical on every run, making the printed
# statistics reproducible.
set.seed(101)
n <- nrow(wlddev)

# Generate random, non-negative weights
weights <- abs(rnorm(n))

# Compute weighted mean, SD, skewness and kurtosis
qsu(wlddev, w = weights, higher = TRUE)

# Weights may contain missing values: set 1000 randomly chosen weights to NA
weightsNA <- weights
weightsNA[sample.int(n, 1000)] <- NA

# But now the observations with a missing weight are removed from all variables
qsu(wlddev, w = weightsNA, higher = TRUE)

# Grouped and panel-summaries can also be weighted in the same manner
# Alternative Output Formats ---------------
# Simple case
as.data.frame(qsu(mtcars))

# For matrices can also use qDF/qDT/qTBL to assign custom name and get a
# character-id
qDF(qsu(mtcars), "car")

# DF from 3D array: do not combine with aperm(), might introduce wrong column
# labels. 'stats' is the region + income summary array created earlier in this
# script.
as.data.frame(stats, gid = "Region_Income")

# DF from 4D array: also no aperm().
# The summary is grouped by income, so the group-id column is named "Income"
# (it was previously mislabelled "Region").
as.data.frame(qsu(wlddev, ~ income, ~ iso3c, cols = 9:10), gid = "Income")

# Output as nested list
psrl <- qsu(wlddev, ~ income, ~ iso3c, cols = 9:10, array = FALSE)
psrl

# We can now use unlist2d to create a tidy data frame
unlist2d(psrl, c("Variable", "Trans"), row.names = "Income")
# Run the code above in your browser using DataLab