library(dplyr)
# Example input: `df` stands in for the data frame you want to process.
# Every column contains at least one missing value (`NA`).
df <- tibble::tibble(
  Sepal_Length = c(5.2, 5.0, 5.7, NA_real_, 6.2, 6.7, 5.5),
  Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA_real_, 5.8, 3.7),
  Petal_Width = c(NA_real_, 0.2, 1.2, 0.2, 1.3, 1.8, NA_real_),
  Species = c(
    "setosa", NA_character_, "versicolor", "setosa",
    NA_character_, "virginica", "setosa"
  )
)
# NOTE(review): `fill_missing_values()` is not defined or attached anywhere in
# the visible script -- presumably it comes from the package these examples
# document; confirm that package is loaded before running.

# If you do not specify `selected_variables` (i.e., leave it as `NULL`),
# the function will impute missing values for all columns in the data frame.
result_df_mean <- fill_missing_values(df, method = "mean")
result_df_mean

# If you specify column names, only those columns will be imputed. For
# example, impute `Petal_Length` and `Petal_Width` using the geometric mean.
result_df_geomean <- fill_missing_values(
  df,
  selected_variables = c("Petal_Length", "Petal_Width"),
  method = "geometric"
)
result_df_geomean

# If you specify column positions, only the columns at those positions will
# be imputed (here columns 2 and 3, with `method = "max"`).
result_df_max <- fill_missing_values(
  df,
  selected_variables = c(2, 3),
  method = "max"
)
result_df_max
# Impute missing values (NAs) in a grouped data frame.
# The data is split into one data frame per `Species`, each piece is imputed
# on its own with `method = "median"`, and the pieces are stacked back into a
# single data frame.
sample_iris <- tibble::tibble(
  Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5),
  Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7),
  Petal_Width = c(0.3, 0.2, 1.2, 0.2, 1.3, 1.8, NA),
  Species = c(
    "setosa", "setosa", "versicolor", "setosa",
    "virginica", "virginica", "setosa"
  )
)

sample_iris %>%
  group_by(Species) %>%
  group_split() %>%
  # `lapply()` + `bind_rows()` is used instead of `map_df()`: `map_df()`
  # belongs to purrr, which this script never attaches (only dplyr is
  # loaded), and it is itself just "map, then bind_rows".
  lapply(fill_missing_values, method = "median") %>%
  bind_rows()
# Run the code above in your browser using DataLab