# NOT RUN {
# Data frame shared by all examples in this doc: the local mtcars data with
# its row names copied into a leading `model` column, handed to
# createDataFrame().
cars_with_model <- cbind(model = rownames(mtcars), mtcars)
df <- createDataFrame(cars_with_model)
# }
# NOT RUN {
# }
# NOT RUN {
# Distinct-count helpers applied to the gear column.
gear_col <- df$gear

# approxCountDistinct(): once with only the column, once with 0.02 passed as
# the second argument.
head(select(df, approxCountDistinct(gear_col)))
head(select(df, approxCountDistinct(gear_col, 0.02)))

# countDistinct() over gear and cyl together; n_distinct() over gear alone.
head(select(df, countDistinct(gear_col, df$cyl)))
head(select(df, n_distinct(gear_col)))

# The distinct gear values themselves, for comparison with the counts above.
gear_only <- select(df, "gear")
head(distinct(gear_only))
# }
# NOT RUN {
# }
# NOT RUN {
# mean(), sd(), skewness() and kurtosis() of mpg, computed in one select().
head(
  select(
    df,
    mean(df$mpg),
    sd(df$mpg),
    skewness(df$mpg),
    kurtosis(df$mpg)
  )
)
# }
# NOT RUN {
# }
# NOT RUN {
# Whole-table aggregates: avg() and mean() are both applied to mpg, next to
# sum(mpg), min(wt) and max(qsec).
head(
  select(
    df,
    avg(df$mpg), mean(df$mpg), sum(df$mpg),
    min(df$wt), max(df$qsec)
  )
)

# Averages of mpg, hp, wt and qsec per number of cylinders, shown ordered
# by cyl.
by_cyl <- groupBy(df, "cyl")
tmp <- agg(by_cyl, avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec))
head(orderBy(tmp, "cyl"))

# Car(s) with the maximum mpg: collect the aggregated maximum locally,
# convert it to a plain number, then keep the rows whose mpg equals it.
max_mpg_result <- collect(agg(df, max(df$mpg)))
mpg_max <- as.numeric(max_mpg_result)
head(where(df, df$mpg == mpg_max))
# }
# NOT RUN {
# }
# NOT RUN {
# Standard-deviation helpers side by side: sd() and stddev() on mpg,
# stddev_pop() on wt and stddev_samp() on qsec.
head(
  select(
    df,
    sd(df$mpg),
    stddev(df$mpg),
    stddev_pop(df$wt),
    stddev_samp(df$qsec)
  )
)
# }
# NOT RUN {
# }
# NOT RUN {
# sumDistinct() applied to the gear column.
gear_distinct_sum <- sumDistinct(df$gear)
head(select(df, gear_distinct_sum))

# The distinct gear values, for comparison with the sum above.
gear_values <- distinct(select(df, "gear"))
head(gear_values)
# }
# NOT RUN {
# }
# NOT RUN {
# Variance helpers side by side on mpg: var(), variance(), var_pop() and
# var_samp(), aggregated over the whole data frame.
head(
  agg(
    df,
    var(df$mpg),
    variance(df$mpg),
    var_pop(df$mpg),
    var_samp(df$mpg)
  )
)
# }
# NOT RUN {
# }
# NOT RUN {
# Keep only the rows with mpg above 20, then apply collect_list() and
# collect_set() to their gear values and bring each result back locally.
# Assignment uses `<-`, matching the rest of this file.
df2 <- df[df$mpg > 20, ]
collect(select(df2, collect_list(df2$gear)))
collect(select(df2, collect_set(df2$gear)))
# }
# NOT RUN {
# }
# NOT RUN {
# With cube: mean mpg for the groups built by cube() on cyl, gear and am,
# plus grouping_bit() for each of the three grouping columns.
cube_groups <- cube(df, "cyl", "gear", "am")
agg(
  cube_groups,
  mean(df$mpg),
  grouping_bit(df$cyl),
  grouping_bit(df$gear),
  grouping_bit(df$am)
)

# With rollup: the same aggregation, grouped by rollup() instead of cube().
rollup_groups <- rollup(df, "cyl", "gear", "am")
agg(
  rollup_groups,
  mean(df$mpg),
  grouping_bit(df$cyl),
  grouping_bit(df$gear),
  grouping_bit(df$am)
)
# }
# NOT RUN {
# }
# NOT RUN {
# With cube: mean mpg for the groups built by cube() on cyl, gear and am,
# plus a single grouping_id() computed over the three grouping columns.
cube_groups <- cube(df, "cyl", "gear", "am")
agg(
  cube_groups,
  mean(df$mpg),
  grouping_id(df$cyl, df$gear, df$am)
)

# With rollup: the same aggregation, grouped by rollup() instead of cube().
rollup_groups <- rollup(df, "cyl", "gear", "am")
agg(
  rollup_groups,
  mean(df$mpg),
  grouping_id(df$cyl, df$gear, df$am)
)
# }
# Run the code above in your browser using DataLab