Learn R Programming

ggRandomForests (version 2.2.1)

gg_partial: Partial variable dependence object

Description

The plot.variable function returns a list of either marginal variable dependence or partial variable dependence data from a rfsrc object. The gg_partial function formulates the plot.variable output for partial plots (where partial=TRUE) into a data object for creation of partial dependence plots using the plot.gg_partial function.

Partial variable dependence plots are the risk adjusted estimates of the specified response as a function of a single covariate, possibly subsetted on other covariates.

An option named argument can name a column for merging multiple plots together

Usage

gg_partial(object, ...)

Value

gg_partial object. A data.frame or list of data.frames corresponding the variables contained within the plot.variable output.

Arguments

object

the partial variable dependence data object from plot.variable function

...

optional arguments

References

Friedman, Jerome H. 2000. "Greedy Function Approximation: A Gradient Boosting Machine." Annals of Statistics 29: 1189-1232."

See Also

plot.gg_partial

plot.variable

Examples

Run this code
## ------------------------------------------------------------
## classification
## ------------------------------------------------------------
## -------- iris data
## iris "Petal.Width" partial dependence plot
##
rfsrc_iris <- rfsrc(Species ~., data = iris)
partial_iris <- plot.variable(rfsrc_iris, 
                              xvar.names = "Petal.Width",
                              partial=TRUE)

gg_dta <- gg_partial(partial_iris)
plot(gg_dta)

## ------------------------------------------------------------
## regression
## ------------------------------------------------------------
if (FALSE) {
## -------- air quality data
## airquality "Wind" partial dependence plot
##
rfsrc_airq <- rfsrc(Ozone ~ ., data = airquality)
partial_airq <- plot.variable(rfsrc_airq, 
                              xvar.names = "Wind",
                             partial=TRUE, show.plot=FALSE)

gg_dta <- gg_partial(partial_airq)
plot(gg_dta)

}
if (FALSE) {
## -------- Boston data
data(Boston, package = "MASS")
Boston$chas <- as.logical(Boston$chas)
rfsrc_boston <- rfsrc(medv ~ .,
   data = Boston,
   forest = TRUE,
   importance = TRUE,
   tree.err = TRUE,
   save.memory = TRUE)
   
varsel_boston <- var.select(rfsrc_boston)

partial_boston <- plot.variable(rfsrc_boston, 
  xvar.names = varsel_boston$topvars,
  sorted = FALSE,
  partial = TRUE, 
  show.plots = FALSE)
gg_dta <- gg_partial(partial_boston)
plot(gg_dta, panel=TRUE)
}
if (FALSE) {
## -------- mtcars data
rfsrc_mtcars <- rfsrc(mpg ~ ., data = mtcars)
varsel_mtcars <- var.select(rfsrc_mtcars)

partial_mtcars <- plot.variable(rfsrc_mtcars, 
  xvar.names = varsel_mtcars$topvars,
  sorted = FALSE,
  partial = TRUE, 
  show.plots = FALSE)

gg_dta <- gg_partial(partial_mtcars)

gg_dta.cat <- gg_dta
gg_dta.cat[["disp"]] <- gg_dta.cat[["wt"]] <- gg_dta.cat[["hp"]] <- NULL
gg_dta.cat[["drat"]] <- gg_dta.cat[["carb"]] <- gg_dta.cat[["qsec"]] <- NULL

plot(gg_dta.cat, panel=TRUE, notch=TRUE)

gg_dta[["cyl"]] <- gg_dta[["vs"]] <- gg_dta[["am"]] <- NULL
gg_dta[["gear"]] <- NULL
plot(gg_dta, panel=TRUE)
}

## ------------------------------------------------------------
## survival examples
## ------------------------------------------------------------
if (FALSE) {
## -------- veteran data
## survival "age" partial variable dependence plot
##
 data(veteran, package = "randomForestSRC")
 rfsrc_veteran <- rfsrc(Surv(time,status)~., veteran, nsplit = 10,
     ntree = 100)

varsel_rfsrc <- var.select(rfsrc_veteran)

## 30 day partial plot for age
partial_veteran <- plot.variable(rfsrc_veteran, surv.type = "surv",
                               partial = TRUE, time=30,
                               show.plots=FALSE)

gg_dta <- gg_partial(partial_veteran)
plot(gg_dta, panel=TRUE)

gg_dta.cat <- gg_dta
gg_dta[["celltype"]] <- gg_dta[["trt"]] <- gg_dta[["prior"]] <- NULL
plot(gg_dta, panel=TRUE)

gg_dta.cat[["karno"]] <- gg_dta.cat[["diagtime"]] <-
    gg_dta.cat[["age"]] <- NULL
plot(gg_dta.cat, panel=TRUE, notch=TRUE)

gg_dta <- lapply(partial_veteran, gg_partial)
gg_dta <- combine.gg_partial(gg_dta[[1]], gg_dta[[2]] )

plot(gg_dta[["karno"]])
plot(gg_dta[["celltype"]])

gg_dta.cat <- gg_dta
gg_dta[["celltype"]] <- gg_dta[["trt"]] <- gg_dta[["prior"]] <- NULL
plot(gg_dta, panel=TRUE)

gg_dta.cat[["karno"]] <- gg_dta.cat[["diagtime"]] <-
    gg_dta.cat[["age"]] <- NULL
plot(gg_dta.cat, panel=TRUE, notch=TRUE)

## ------------------------------------------------------------
## -------- pbc data
# We need to create this dataset
data(pbc, package = "randomForestSRC",) 
# For whatever reason, the age variable is in days... makes no sense to me
for (ind in seq_len(dim(pbc)[2])) {
 if (!is.factor(pbc[, ind])) {
   if (length(unique(pbc[which(!is.na(pbc[, ind])), ind])) <= 2) {
     if (sum(range(pbc[, ind], na.rm = TRUE) == c(0, 1)) == 2) {
       pbc[, ind] <- as.logical(pbc[, ind])
     }
   }
 } else {
   if (length(unique(pbc[which(!is.na(pbc[, ind])), ind])) <= 2) {
     if (sum(sort(unique(pbc[, ind])) == c(0, 1)) == 2) {
       pbc[, ind] <- as.logical(pbc[, ind])
     }
     if (sum(sort(unique(pbc[, ind])) == c(FALSE, TRUE)) == 2) {
       pbc[, ind] <- as.logical(pbc[, ind])
     }
   }
 }
 if (!is.logical(pbc[, ind]) &
     length(unique(pbc[which(!is.na(pbc[, ind])), ind])) <= 5) {
   pbc[, ind] <- factor(pbc[, ind])
 }
}
#Convert age to years
pbc$age <- pbc$age / 364.24

pbc$years <- pbc$days / 364.24
pbc <- pbc[, -which(colnames(pbc) == "days")]
pbc$treatment <- as.numeric(pbc$treatment)
pbc$treatment[which(pbc$treatment == 1)] <- "DPCA"
pbc$treatment[which(pbc$treatment == 2)] <- "placebo"
pbc$treatment <- factor(pbc$treatment)
dta_train <- pbc[-which(is.na(pbc$treatment)), ]
# Create a test set from the remaining patients
 pbc_test <- pbc[which(is.na(pbc$treatment)), ]

#========
# build the forest:
rfsrc_pbc <- randomForestSRC::rfsrc(
  Surv(years, status) ~ .,
 dta_train,
 nsplit = 10,
 na.action = "na.impute",
 forest = TRUE,
 importance = TRUE,
 save.memory = TRUE
)

varsel_pbc <- var.select(rfsrc_pbc)

xvar <- varsel_pbc$topvars

# Convert all partial plots to gg_partial objects
gg_dta <- lapply(partial_pbc, gg_partial)

# Combine the objects to get multiple time curves
# along variables on a single figure.
pbc_ggpart <- combine.gg_partial(gg_dta[[1]], gg_dta[[2]],
                                 lbls = c("1 Year", "3 Years"))

summary(pbc_ggpart)
class(pbc_ggpart[["bili"]])

# Plot the highest ranked variable, by name.
#plot(pbc_ggpart[["bili"]])

# Create a temporary holder and remove the stage and edema data
ggpart <- pbc_ggpart
ggpart$edema <- NULL

# Panel plot the remainder.
plot(ggpart, panel = TRUE)

plot(pbc_ggpart[["edema"]])
}

Run the code above in your browser using DataLab