# NOT RUN {
# Shared input for every example in this file: the built-in Titanic table,
# converted to a data.frame with its strings kept as character (not factor)
# and then handed to createDataFrame().
df <- createDataFrame(
  as.data.frame(Titanic, stringsAsFactors = FALSE)
)
# }
# NOT RUN {
# }
# NOT RUN {
# Apply ascii() to the Class and Sex columns and preview the first rows.
head(
  select(
    df,
    ascii(df$Class),
    ascii(df$Sex)
  )
)
# }
# NOT RUN {
# }
# NOT RUN {
# Add s1 = encode(Class, "UTF-8") and inspect the structure of the result.
tmp <- mutate(
  df,
  s1 = encode(df$Class, "UTF-8")
)
str(tmp)

# Derive three more columns: base64() of s1, decode() of s1 with "UTF-8",
# and soundex() of Sex.
tmp2 <- mutate(
  tmp,
  s2 = base64(tmp$s1),
  s3 = decode(tmp$s1, "UTF-8"),
  s4 = soundex(tmp$Sex)
)
head(tmp2)

# Apply unbase64() to the base64 column s2.
head(
  select(tmp2, unbase64(tmp2$s2))
)
# }
# NOT RUN {
# }
# NOT RUN {
# Apply lower() to Sex, upper() to Age, and join the lower-cased Sex and Age
# with a single space via concat_ws().
# Columns are referenced by their exact schema names (Sex, Age), matching the
# rest of this file. The lower-case spellings `df$sex` / `df$age` would only
# resolve while Spark's column resolution is case-insensitive
# (NOTE(review): believed to be the default `spark.sql.caseSensitive = false`
# -- confirm against the Spark SQL configuration docs).
tmp <- mutate(
  df,
  sex_lower = lower(df$Sex),
  age_upper = upper(df$Age),
  sex_age = concat_ws(" ", lower(df$Sex), lower(df$Age))
)
head(tmp)

# Apply initcap() to the two derived columns and reverse() to Sex. Every
# column is taken from `tmp`, the DataFrame actually being mutated, instead of
# mixing in a reference to the parent `df`.
tmp2 <- mutate(
  tmp,
  s1 = initcap(tmp$sex_lower),
  s2 = initcap(tmp$sex_age),
  s3 = reverse(tmp$Sex)
)
head(tmp2)
# }
# NOT RUN {
# }
# NOT RUN {
# Pad Sex with a single-space pad string: lpad() with 6 and rpad() with 7.
tmp <- mutate(
  df,
  SexLpad = lpad(df$Sex, 6, " "),
  SexRpad = rpad(df$Sex, 7, " ")
)
# Show length() of the original column next to the two padded ones.
head(
  select(
    tmp,
    length(tmp$Sex),
    length(tmp$SexLpad),
    length(tmp$SexRpad)
  )
)

# Run ltrim()/rtrim()/trim() over the padded columns and compare lengths again.
tmp2 <- mutate(
  tmp,
  SexLtrim = ltrim(tmp$SexLpad),
  SexRtrim = rtrim(tmp$SexRpad),
  SexTrim = trim(tmp$SexLpad)
)
head(
  select(
    tmp2,
    length(tmp2$Sex),
    length(tmp2$SexLtrim),
    length(tmp2$SexRtrim),
    length(tmp2$SexTrim)
  )
)

# Same padding calls, this time with the two-character pad string "xx".
tmp <- mutate(
  df,
  SexLpad = lpad(df$Sex, 6, "xx"),
  SexRpad = rpad(df$Sex, 7, "xx")
)
head(tmp)
# }
# NOT RUN {
# }
# NOT RUN {
# levenshtein() over three column pairs: Class/Sex, Age/Sex, and Age against
# itself.
tmp <- mutate(
  df,
  d1 = levenshtein(df$Class, df$Sex),
  d2 = levenshtein(df$Age, df$Sex),
  d3 = levenshtein(df$Age, df$Age)
)
head(tmp)
# }
# NOT RUN {
# }
# NOT RUN {
# Look for "m" and "M" in Sex with instr(), and for "m" with locate(), once
# without `pos` and once with pos = 4.
tmp <- mutate(
  df,
  s1 = instr(df$Sex, "m"),
  s2 = instr(df$Sex, "M"),
  s3 = locate("m", df$Sex),
  s4 = locate("m", df$Sex, pos = 4)
)
head(tmp)
# }
# NOT RUN {
# }
# NOT RUN {
# v1 is Freq divided by 3, giving a fractional value to format.
tmp <- mutate(df, v1 = df$Freq / 3)

# Show the first 10 rows of v1 passed through format_number() with 0 and 2,
# and through format_string() together with Sex.
head(
  select(
    tmp,
    format_number(tmp$v1, 0),
    format_number(tmp$v1, 2),
    format_string("%4.2f %s", tmp$v1, tmp$Sex)
  ),
  10
)
# }
# NOT RUN {
# }
# NOT RUN {
# Concatenate string columns with concat_ws(): Class and Sex with "_", and
# Class, Sex, Age and Survived with "+".
tmp <- mutate(
  df,
  s1 = concat_ws("_", df$Class, df$Sex),
  s2 = concat_ws("+", df$Class, df$Sex, df$Age, df$Survived)
)
head(tmp)
# }
# NOT RUN {
# }
# NOT RUN {
# Pattern-based column functions applied to Class and Sex:
#   s1, s2  regexp_extract() with group index 1
#   s3      regexp_replace() of "\\D+" with the empty string
#   s4, s5  substring_index() on "a" with counts 1 and -1
#   s6, s7  translate() of "ale" to "" and of "a" to "-"
tmp <- mutate(
  df,
  s1 = regexp_extract(df$Class, "(\\d+)\\w+", 1),
  s2 = regexp_extract(df$Sex, "^(\\w)\\w+", 1),
  s3 = regexp_replace(df$Class, "\\D+", ""),
  s4 = substring_index(df$Sex, "a", 1),
  s5 = substring_index(df$Sex, "a", -1),
  s6 = translate(df$Sex, "ale", ""),
  s7 = translate(df$Sex, "a", "-")
)
head(tmp)
# }
# NOT RUN {
# }
# NOT RUN {
# split_string() on Class with pattern "\\d" and a third argument of 2
# (presumably a limit on the split -- confirm against the SparkR docs).
head(
  select(df, split_string(df$Class, "\\d", 2))
)

# split_string() on Sex with pattern "a".
head(
  select(df, split_string(df$Sex, "a"))
)

# split_string() on Class with pattern "\\d" and no third argument ...
head(
  select(df, split_string(df$Class, "\\d"))
)
# ... which is equivalent to the following SQL expression
head(
  selectExpr(df, "split(Class, '\\\\d')")
)
# }
# NOT RUN {
# }
# NOT RUN {
# repeat_string() on Class with a count of 3.
head(
  select(df, repeat_string(df$Class, 3))
)
# The same operation written as a SQL expression.
head(
  selectExpr(df, "repeat(Class, 3)")
)
# }
# Run the code above in your browser using DataLab