# Example #1: Simple Data Step
df <- datastep(mtcars[1:10,],
keep = c("mpg", "cyl", "disp", "mpgcat", "recdt", "is8cyl"), {
if (mpg >= 20)
mpgcat <- "High"
else
mpgcat <- "Low"
recdt <- as.Date("1974-06-10")
if (cyl == 8)
is8cyl <- TRUE
else
is8cyl <- FALSE
})
df
# mpg cyl disp mpgcat recdt
# Mazda RX4 21.0 6 160.0 High 1974-06-10
# Mazda RX4 Wag 21.0 6 160.0 High 1974-06-10
# Datsun 710 22.8 4 108.0 High 1974-06-10
# Hornet 4 Drive 21.4 6 258.0 High 1974-06-10
# Hornet Sportabout 18.7 8 360.0 Low 1974-06-10
# Valiant 18.1 6 225.0 Low 1974-06-10
# Duster 360 14.3 8 360.0 Low 1974-06-10
# Merc 240D 24.4 4 146.7 High 1974-06-10
# Merc 230 22.8 4 140.8 High 1974-06-10
# Merc 280 19.2 6 167.6 Low 1974-06-10
# Example #2: By-group Processing
df <- datastep(mtcars[1:10,],
keep = c("mpg", "cyl", "gear", "grp"),
by = c("gear"), sort_check = FALSE, {
if (first.)
grp <- "Start"
else if (last.)
grp <- "End"
else
grp <- "-"
})
df
# mpg cyl gear grp
# Mazda RX4 21.0 6 4 Start
# Mazda RX4 Wag 21.0 6 4 -
# Datsun 710 22.8 4 4 End
# Hornet 4 Drive 21.4 6 3 Start
# Hornet Sportabout 18.7 8 3 -
# Valiant 18.1 6 3 -
# Duster 360 14.3 8 3 End
# Merc 240D 24.4 4 4 Start
# Merc 230 22.8 4 4 -
# Merc 280 19.2 6 4 End
# Example #3: Calculate Block
df <- datastep(mtcars,
keep = c("mpg", "cyl", "mean_mpg", "mpgcat"),
calculate = { mean_mpg = mean(mpg) }, {
if (mpg >= mean_mpg)
mpgcat <- "High"
else
mpgcat <- "Low"
})
df[1:10,]
# mpg cyl mean_mpg mpgcat
# Mazda RX4 21.0 6 20.09062 High
# Mazda RX4 Wag 21.0 6 20.09062 High
# Datsun 710 22.8 4 20.09062 High
# Hornet 4 Drive 21.4 6 20.09062 High
# Hornet Sportabout 18.7 8 20.09062 Low
# Valiant 18.1 6 20.09062 Low
# Duster 360 14.3 8 20.09062 Low
# Merc 240D 24.4 4 20.09062 High
# Merc 230 22.8 4 20.09062 High
# Merc 280 19.2 6 20.09062 Low
# Example #4: Data pipeline
library(dplyr)
library(magrittr)
# Add datastep to dplyr pipeline
df <- mtcars %>%
select(mpg, cyl, gear) %>%
mutate(mean_mpg = mean(mpg)) %>%
datastep({
if (mpg >= mean_mpg)
mpgcat <- "High"
else
mpgcat <- "Low"
}) %>%
filter(row_number() <= 10)
df
# mpg cyl gear mean_mpg mpgcat
# 1 21.0 6 4 20.09062 High
# 2 21.0 6 4 20.09062 High
# 3 22.8 4 4 20.09062 High
# 4 21.4 6 3 20.09062 High
# 5 18.7 8 3 20.09062 Low
# 6 18.1 6 3 20.09062 Low
# 7 14.3 8 3 20.09062 Low
# 8 24.4 4 4 20.09062 High
# 9 22.8 4 4 20.09062 High
# 10 19.2 6 4 20.09062 Low
# Example #5: Drop, Retain and Rename
df <- datastep(mtcars[1:10, ],
drop = c("disp", "hp", "drat", "qsec",
"vs", "am", "gear", "carb"),
retain = list(cumwt = 0 ),
rename = c(mpg = "MPG", cyl = "Cylinders", wt = "Wgt",
cumwt = "Cumulative Wgt"), {
cumwt <- cumwt + wt
})
df
# MPG Cylinders Wgt Cumulative Wgt
# Mazda RX4 21.0 6 2.620 2.620
# Mazda RX4 Wag 21.0 6 2.875 5.495
# Datsun 710 22.8 4 2.320 7.815
# Hornet 4 Drive 21.4 6 3.215 11.030
# Hornet Sportabout 18.7 8 3.440 14.470
# Valiant 18.1 6 3.460 17.930
# Duster 360 14.3 8 3.570 21.500
# Merc 240D 24.4 4 3.190 24.690
# Merc 230 22.8 4 3.150 27.840
# Merc 280 19.2 6 3.440 31.280
# Example #6: Attributes and Arrays
# Create sample data
dat <- read.table(header = TRUE, text = '
Year Q1 Q2 Q3 Q4
2000 125 137 152 140
2001 132 145 138 87
2002 101 104 115 121')
# Use attrib list to control column order and add labels
# Use array to calculate row sums and means, and get best quarter
df <- datastep(dat,
attrib = list(Tot = dsattr(0, label = "Year Total"),
Avg = dsattr(0, label = "Year Average"),
Best = dsattr(0, label = "Best Quarter")),
arrays = list(qtrs = dsarray("Q1", "Q2", "Q3", "Q4")),
drop = "q",
steps = {
# Empty brackets return all array values
Tot <- sum(qtrs[])
Avg <- mean(qtrs[])
# Iterate to find best quarter
for (q in qtrs) {
if (qtrs[q] == max(qtrs[]))
Best <- q
}
})
df
# Year Q1 Q2 Q3 Q4 Tot Avg Best
# 1 2000 125 137 152 140 554 138.50 Q3
# 2 2001 132 145 138 87 502 125.50 Q2
# 3 2002 101 104 115 121 441 110.25 Q4
dictionary(df)
# A tibble: 8 x 10
# Name Column Class Label Description Format Width Justify Rows NAs
#
# 1 df Year integer NA NA NA NA NA 3 0
# 2 df Q1 integer NA NA NA NA NA 3 0
# 3 df Q2 integer NA NA NA NA NA 3 0
# 4 df Q3 integer NA NA NA NA NA 3 0
# 5 df Q4 integer NA NA NA NA NA 3 0
# 6 df Tot integer Year Total NA NA NA NA 3 0
# 7 df Avg numeric Year Average NA NA NA NA 3 0
# 8 df Best character Best Quarter NA NA 2 NA 3 0
# Example #7: Set and Merge Operations
# Create sample data
grp1 <- read.table(header = TRUE, text = '
GROUP NAME
G01 Group1
G02 Group2
', stringsAsFactors = FALSE)
grp2 <- read.table(header = TRUE, text = '
GROUP NAME
G03 Group3
G04 Group4
', stringsAsFactors = FALSE)
dat <- read.table(header = TRUE, text = '
ID AGE SEX GROUP
A01 58 F G01
A02 20 M G02
A03 47 F G05
A04 11 M G03
A05 23 F G01
', stringsAsFactors = FALSE)
# Set operation
grps <- datastep(grp1, set = grp2, {})
grps
# GROUP NAME
# 1 G01 Group1
# 2 G02 Group2
# 3 G03 Group3
# 4 G04 Group4
# Merge operation - Outer Join
res <- datastep(dat, merge = grps,
merge_by = "GROUP",
merge_in = c("inA", "inB"), {})
# View results
res
# ID AGE SEX GROUP NAME inA inB
# 1 A01 58 F G01 Group1 1 1
# 2 A05 23 F G01 Group1 1 1
# 3 A02 20 M G02 Group2 1 1
# 4 A04 11 M G03 Group3 1 1
# 5 A03 47 F G05 1 0
# 6 NA G04 Group4 0 1
# Merge operation - Inner Join
res <- datastep(dat, merge = grps,
merge_by = "GROUP",
merge_in = c("inA", "inB"),
where = expression(inA & inB), {})
# View results
res
# ID AGE SEX GROUP NAME inA inB
# 1 A01 58 F G01 Group1 1 1
# 2 A05 23 F G01 Group1 1 1
# 3 A02 20 M G02 Group2 1 1
# 4 A04 11 M G03 Group3 1 1
# Example #8: Data NULL
# Create new dataset using output() functions.
res <- datastep(NULL,
{
ID <- 10
Item <- "Pencil"
output()
ID <- 20
Item <- "Scissors"
output()
})
# View results
res
# ID Item
# 1 10 Pencil
# 2 20 Scissors
Run the code above in your browser using DataLab