# NOT RUN {
# Build a demo data frame: 1000 simulated rows plus two extreme rows.
set.seed(10)
df <- data.frame(var1 = rchisq(1000, df = 1), var2 = rnorm(1000))
# Forcing outliers: rbind() recycles each scalar across every column, so both
# var1 and var2 receive the values 1135 and 2432 in the two appended rows.
df <- rbind(df, 1135, 2432)
# Row identifier stored as character; derived from the actual row count
# rather than a hard-coded 1002, so it stays correct if the sizes above change.
df$id <- as.character(seq_len(nrow(df)))
# For var1: the mean is ~ 4.56 and the max is 2432.
summary(df)
########################################################
### PREPARING OUTLIERS FOR DESCRIPTIVE STATISTICS
########################################################
#### EXAMPLE 1: Removing the top 1% for a single variable
# Check the cutoff for the top 1% of highest values (percentile 0.99),
# which is ~ 7.05.
quantile(df$var1, 0.99)
# With type = "set_na", the values above the cutoff given by top_percent are
# replaced with NA. Here 'data' is a single vector, so a single vector is
# returned as well.
# prep_outliers is namespace-qualified (as in the calls further down this
# file) so this example does not depend on funModeling being attached.
var1_treated <- funModeling::prep_outliers(
  data = df$var1, type = "set_na", top_percent = 0.01, method = "bottom_top"
)
# Now the mean (~ 1) is more accurate; note that the 1st quartile, median and
# 3rd quartile remain very similar to those of the original variable.
summary(var1_treated)
#### EXAMPLE 2: Removing the top and bottom 1% for the specified input variables.
vars_to_process <- c("var1", "var2")
df_treated3 <- funModeling::prep_outliers(
  data = df, input = vars_to_process, type = "set_na",
  bottom_percent = 0.01, top_percent = 0.01, method = "bottom_top"
)
summary(df_treated3)
########################################################
### PREPARING OUTLIERS FOR PREDICTIVE MODELING
########################################################
# Using the Hampel method with type = "stop": per the prep_outliers
# documentation, values falling outside the thresholds are converted to the
# threshold value instead of being set to NA.
# Every funModeling object is namespace-qualified so the example does not
# depend on the package being attached.
data_prep_h <- funModeling::prep_outliers(
  data = funModeling::heart_disease,
  input = c("age", "resting_blood_pressure"),
  method = "hampel", type = "stop"
)
# Compare age before and after the treatment.
summary(funModeling::heart_disease$age)
summary(data_prep_h$age)
# The minimum changed from 29 to 29.31, and the max remains the same at 77.
funModeling::hampel_outlier(funModeling::heart_disease$age) # checking the thresholds
# Same treatment using the Tukey method.
data_prep_a <- funModeling::prep_outliers(
  data = funModeling::heart_disease,
  input = c("age", "resting_blood_pressure"),
  method = "tukey", type = "stop"
)
max(funModeling::heart_disease$age)
max(data_prep_a$age)
# The max remains the same (77) because the upper threshold for age is 100.
funModeling::tukey_outlier(funModeling::heart_disease$age)
# }
# Run the code above in your browser using DataLab