Learn R Programming

SparkR (version 3.1.2)

column_collection_functions: Collection functions for Column operations

Description

Collection functions defined for Column.

Usage

array_aggregate(x, initialValue, merge, ...)

array_contains(x, value)

array_distinct(x)

array_except(x, y)

array_exists(x, f)

array_forall(x, f)

array_filter(x, f)

array_intersect(x, y)

array_join(x, delimiter, ...)

array_max(x)

array_min(x)

array_position(x, value)

array_remove(x, value)

array_repeat(x, count)

array_sort(x)

array_transform(x, f)

arrays_overlap(x, y)

array_union(x, y)

arrays_zip(x, ...)

arrays_zip_with(x, y, f)

concat(x, ...)

element_at(x, extraction)

explode(x)

explode_outer(x)

flatten(x)

from_json(x, schema, ...)

from_csv(x, schema, ...)

map_concat(x, ...)

map_entries(x)

map_filter(x, f)

map_from_arrays(x, y)

map_from_entries(x)

map_keys(x)

map_values(x)

map_zip_with(x, y, f)

posexplode(x)

posexplode_outer(x)

reverse(x)

schema_of_csv(x, ...)

schema_of_json(x, ...)

shuffle(x)

size(x)

slice(x, start, length)

sort_array(x, asc = TRUE)

transform_keys(x, f)

transform_values(x, f)

to_json(x, ...)

to_csv(x, ...)

# S4 method for Column reverse(x)

# S4 method for Column to_json(x, ...)

# S4 method for Column to_csv(x, ...)

# S4 method for Column concat(x, ...)

# S4 method for Column,characterOrstructTypeOrColumn from_json(x, schema, as.json.array = FALSE, ...)

# S4 method for characterOrColumn schema_of_json(x, ...)

# S4 method for Column,characterOrstructTypeOrColumn from_csv(x, schema, ...)

# S4 method for characterOrColumn schema_of_csv(x, ...)

# S4 method for characterOrColumn,Column,`function` array_aggregate(x, initialValue, merge, finish = NULL)

# S4 method for Column array_contains(x, value)

# S4 method for Column array_distinct(x)

# S4 method for Column,Column array_except(x, y)

# S4 method for characterOrColumn,`function` array_exists(x, f)

# S4 method for characterOrColumn,`function` array_filter(x, f)

# S4 method for characterOrColumn,`function` array_forall(x, f)

# S4 method for Column,Column array_intersect(x, y)

# S4 method for Column,character array_join(x, delimiter, nullReplacement = NULL)

# S4 method for Column array_max(x)

# S4 method for Column array_min(x)

# S4 method for Column array_position(x, value)

# S4 method for Column array_remove(x, value)

# S4 method for Column,numericOrColumn array_repeat(x, count)

# S4 method for Column array_sort(x)

# S4 method for characterOrColumn,`function` array_transform(x, f)

# S4 method for Column,Column arrays_overlap(x, y)

# S4 method for Column,Column array_union(x, y)

# S4 method for Column arrays_zip(x, ...)

# S4 method for characterOrColumn,characterOrColumn,`function` arrays_zip_with(x, y, f)

# S4 method for Column shuffle(x)

# S4 method for Column flatten(x)

# S4 method for Column map_concat(x, ...)

# S4 method for Column map_entries(x)

# S4 method for characterOrColumn,`function` map_filter(x, f)

# S4 method for Column,Column map_from_arrays(x, y)

# S4 method for Column map_from_entries(x)

# S4 method for Column map_keys(x)

# S4 method for characterOrColumn,`function` transform_keys(x, f)

# S4 method for characterOrColumn,`function` transform_values(x, f)

# S4 method for Column map_values(x)

# S4 method for characterOrColumn,characterOrColumn,`function` map_zip_with(x, y, f)

# S4 method for Column element_at(x, extraction)

# S4 method for Column explode(x)

# S4 method for Column size(x)

# S4 method for Column slice(x, start, length)

# S4 method for Column sort_array(x, asc = TRUE)

# S4 method for Column posexplode(x)

# S4 method for Column explode_outer(x)

# S4 method for Column posexplode_outer(x)

Arguments

x

Column to compute on. Note the difference in the following methods:

  • to_json: it is the column containing the struct, array of the structs, the map or array of maps.

  • to_csv: it is the column containing the struct.

  • from_json: it is the column containing the JSON string.

  • from_csv: it is the column containing the CSV string.

initialValue

a Column used as the initial value in array_aggregate

merge

a binary function (Column, Column) -> Column used in array_aggregate to merge values (the second argument) into the accumulator (the first argument).

...

additional argument(s).

  • to_json, from_json and schema_of_json: this contains additional named properties to control how it is converted and accepts the same options as the JSON data source.

  • to_json: it supports the "pretty" option which enables pretty JSON generation.

  • to_csv, from_csv and schema_of_csv: this contains additional named properties to control how it is converted and accepts the same options as the CSV data source.

  • arrays_zip, this contains additional Columns of arrays to be merged.

  • map_concat, this contains additional Columns of maps to be unioned.

value

A value to compute on.

  • array_contains: a value to be checked if contained in the column.

  • array_position: a value to locate in the given array.

  • array_remove: a value to remove in the given array.

y

Column to compute on.

f

a function mapping from Column(s) to Column.

  • array_exists

  • array_filter the Boolean function used to filter the data. Either unary or binary. In the latter case the second argument is the index in the array (0-based).

  • array_forall the Boolean unary function used to filter the data.

  • array_transform a function used to transform the data. Either unary or binary. In the latter case the second argument is the index in the array (0-based).

  • arrays_zip_with

  • map_zip_with

  • map_filter the Boolean binary function used to filter the data. The first argument is the key, the second argument is the value.

  • transform_keys a binary function used to transform the data. The first argument is the key, the second argument is the value.

  • transform_values a binary function used to transform the data. The first argument is the key, the second argument is the value.

delimiter

a character string that is used to concatenate the elements of column.

count

a Column or constant determining the number of repetitions.

extraction

index to check for in array or key to check for in map

schema
  • from_json: a structType object to use as the schema to use when parsing the JSON string. Since Spark 2.3, the DDL-formatted string is also supported for the schema. Since Spark 3.0, schema_of_json or the DDL-formatted string literal can also be accepted.

  • from_csv: a structType object, DDL-formatted string or schema_of_csv

start

the starting index

length

the length of the slice

asc

a logical flag indicating the sorting order. TRUE, sorting is in ascending order. FALSE, sorting is in descending order.

as.json.array

indicating if input string is JSON array of objects or a single object.

finish

a unary function (Column) -> Column used to apply the final transformation on the accumulated data in array_aggregate.

nullReplacement

an optional character string that is used to replace the Null values.

Details

reverse: Returns a reversed string or an array with reverse order of elements.

to_json: Converts a column containing a structType, a mapType or an arrayType into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered.

to_csv: Converts a column containing a structType into a Column of CSV string. Resolving the Column can fail if an unsupported type is encountered.

concat: Concatenates multiple input columns together into a single column. The function works with strings, binary and compatible array columns.

from_json: Parses a column containing a JSON string into a Column of structType with the specified schema or array of structType if as.json.array is set to TRUE. If the string is unparseable, the Column will contain the value NA.

schema_of_json: Parses a JSON string and infers its schema in DDL format.

from_csv: Parses a column containing a CSV string into a Column of structType with the specified schema. If the string is unparseable, the Column will contain the value NA.

schema_of_csv: Parses a CSV string and infers its schema in DDL format.

array_aggregate Applies a binary operator to an initial state and all elements in the array, and reduces this to a single state. The final state is converted into the final result by applying a finish function.

array_contains: Returns null if the array is null, true if the array contains the value, and false otherwise.

array_distinct: Removes duplicate values from the array.

array_except: Returns an array of the elements in the first array but not in the second array, without duplicates. The order of elements in the result is not determined.

array_exists Returns whether a predicate holds for one or more elements in the array.

array_filter Returns an array of elements for which a predicate holds in a given array.

array_forall Returns whether a predicate holds for every element in the array.

array_intersect: Returns an array of the elements in the intersection of the given two arrays, without duplicates.

array_join: Concatenates the elements of column using the delimiter. Null values are replaced with nullReplacement if set, otherwise they are ignored.

array_max: Returns the maximum value of the array.

array_min: Returns the minimum value of the array.

array_position: Locates the position of the first occurrence of the given value in the given array. Returns NA if either of the arguments are NA. Note: The position is not zero based, but 1 based index. Returns 0 if the given value could not be found in the array.

array_remove: Removes all elements that equal to element from the given array.

array_repeat: Creates an array containing x repeated the number of times given by count.

array_sort: Sorts the input array in ascending order. The elements of the input array must be orderable. NA elements will be placed at the end of the returned array.

array_transform Returns an array of elements after applying a transformation to each element in the input array.

arrays_overlap: Returns true if the input arrays have at least one non-null element in common. If not and both arrays are non-empty and any of them contains a null, it returns null. It returns false otherwise.

array_union: Returns an array of the elements in the union of the given two arrays, without duplicates.

arrays_zip: Returns a merged array of structs in which the N-th struct contains all N-th values of input arrays.

arrays_zip_with Merge two given arrays, element-wise, into a single array using a function. If one array is shorter, nulls are appended at the end to match the length of the longer array, before applying the function.

shuffle: Returns a random permutation of the given array.

flatten: Creates a single array from an array of arrays. If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.

map_concat: Returns the union of all the given maps.

map_entries: Returns an unordered array of all entries in the given map.

map_filter Returns a map whose key-value pairs satisfy a predicate.

map_from_arrays: Creates a new map column. The array in the first column is used for keys. The array in the second column is used for values. All elements in the array for key should not be null.

map_from_entries: Returns a map created from the given array of entries.

map_keys: Returns an unordered array containing the keys of the map.

transform_keys Applies a function to every key-value pair in a map and returns a map with the results of those applications as the new keys for the pairs.

transform_values Applies a function to every key-value pair in a map and returns a map with the results of those applications as the new values for the pairs.

map_values: Returns an unordered array containing the values of the map.

map_zip_with: Merges two given maps, key-wise, into a single map using a function.

element_at: Returns element of array at given index in extraction if x is array. Returns value for the given key in extraction if x is map. Note: The position is not zero based, but 1 based index.

explode: Creates a new row for each element in the given array or map column. Uses the default column name col for elements in the array and key and value for elements in the map unless specified otherwise.

size: Returns length of array or map.

slice: Returns an array containing all the elements in x from the index start (array indices start at 1, or from the end if start is negative) with the specified length.

sort_array: Sorts the input array in ascending or descending order according to the natural ordering of the array elements. NA elements will be placed at the beginning of the returned array in ascending order or at the end of the returned array in descending order.

posexplode: Creates a new row for each element with position in the given array or map column. Uses the default column name pos for position, and col for elements in the array and key and value for elements in the map unless specified otherwise.

explode_outer: Creates a new row for each element in the given array or map column. Unlike explode, if the array/map is null or empty then null is produced. Uses the default column name col for elements in the array and key and value for elements in the map unless specified otherwise.

posexplode_outer: Creates a new row for each element with position in the given array or map column. Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced. Uses the default column name pos for position, and col for elements in the array and key and value for elements in the map unless specified otherwise.

Examples

Run this code
# NOT RUN {
# Dataframe used throughout this doc
df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1)))
head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1)))
head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
head(select(tmp, reverse(tmp$v1), array_remove(tmp$v1, 21)))
head(select(tmp, array_transform("v1", function(x) x * 10)))
head(select(tmp, array_exists("v1", function(x) x > 120)))
head(select(tmp, array_forall("v1", function(x) x >= 8.0)))
head(select(tmp, array_filter("v1", function(x) x < 10)))
head(select(tmp, array_aggregate("v1", lit(0), function(acc, y) acc + y)))
head(select(
  tmp,
  array_aggregate("v1", lit(0), function(acc, y) acc + y, function(acc) acc / 10)))
tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
head(tmp2)
head(select(tmp, posexplode(tmp$v1)))
head(select(tmp, slice(tmp$v1, 2L, 2L)))
head(select(tmp, sort_array(tmp$v1)))
head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
head(select(tmp3, map_entries(tmp3$v3), map_keys(tmp3$v3), map_values(tmp3$v3)))
head(select(tmp3, element_at(tmp3$v3, "Valiant"), map_concat(tmp3$v3, tmp3$v3)))
head(select(tmp3, transform_keys("v3", function(k, v) upper(k))))
head(select(tmp3, transform_values("v3", function(k, v) v * 10)))
head(select(tmp3, map_filter("v3", function(k, v) v < 42)))
tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
head(select(tmp4, array_except(tmp4$v4, tmp4$v5), array_intersect(tmp4$v4, tmp4$v5)))
head(select(tmp4, array_union(tmp4$v4, tmp4$v5)))
head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5)))
head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
head(select(tmp4, arrays_zip_with(tmp4$v4, tmp4$v5, function(x, y) x * y)))
tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))
tmp6 <- mutate(df, v7 = create_array(create_array(df$model, df$model)))
head(select(tmp6, flatten(tmp6$v7)))
tmp7 <- mutate(df, v8 = create_array(df$model, df$cyl), v9 = create_array(df$model, df$hp))
head(select(tmp7, arrays_zip_with("v8", "v9", function(x, y) (x * y) %% 3)))
head(select(tmp7, map_from_arrays(tmp7$v8, tmp7$v9)))
tmp8 <- mutate(df, v10 = create_array(struct(df$model, df$cyl)))
head(select(tmp8, map_from_entries(tmp8$v10)))
# }
# NOT RUN {
# }
# NOT RUN {
# Converts a struct into a JSON object
df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy'))

# Converts an array of structs into a JSON array
df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people))

# Converts a map into a JSON object
df2 <- sql("SELECT map('name', 'Bob') as people")
df2 <- mutate(df2, people_json = to_json(df2$people))

# Converts an array of maps into a JSON array
df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people))

# Converts a map into a pretty JSON object
df2 <- sql("SELECT map('name', 'Bob') as people")
df2 <- mutate(df2, people_json = to_json(df2$people, pretty = TRUE))
# }
# NOT RUN {
# }
# NOT RUN {
# Converts a struct into a CSV string
df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
select(df2, to_csv(df2$d, dateFormat = 'dd/MM/yyyy'))
# }
# NOT RUN {
# }
# NOT RUN {
df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
df2 <- mutate(df2, d2 = to_json(df2$d, dateFormat = 'dd/MM/yyyy'))
schema <- structType(structField("date", "string"))
head(select(df2, from_json(df2$d2, schema, dateFormat = 'dd/MM/yyyy')))
df2 <- sql("SELECT named_struct('name', 'Bob') as people")
df2 <- mutate(df2, people_json = to_json(df2$people))
schema <- structType(structField("name", "string"))
head(select(df2, from_json(df2$people_json, schema)))
head(select(df2, from_json(df2$people_json, "name STRING")))
head(select(df2, from_json(df2$people_json, schema_of_json(head(df2)$people_json))))
# }
# NOT RUN {
# }
# NOT RUN {
json <- "{\"name\":\"Bob\"}"
df <- sql("SELECT * FROM range(1)")
head(select(df, schema_of_json(json)))
# }
# NOT RUN {
# }
# NOT RUN {
csv <- "Amsterdam,2018"
df <- sql(paste0("SELECT '", csv, "' as csv"))
schema <- "city STRING, year INT"
head(select(df, from_csv(df$csv, schema)))
head(select(df, from_csv(df$csv, structType(schema))))
head(select(df, from_csv(df$csv, schema_of_csv(csv))))
# }
# NOT RUN {
# }
# NOT RUN {
csv <- "Amsterdam,2018"
df <- sql("SELECT * FROM range(1)")
head(select(df, schema_of_csv(csv)))
# }
# NOT RUN {
# }
# NOT RUN {
df2 <- createDataFrame(data.frame(
  id = c(1, 2, 3), text = c("a,b,c", NA, "d,e")
))

head(select(df2, df2$id, explode_outer(split_string(df2$text, ","))))
head(select(df2, df2$id, posexplode_outer(split_string(df2$text, ","))))
# }

Run the code above in your browser using DataLab