PF_EM: EM Estimation with Particle Filters and Smoothers

Description

Method to estimate the hyper parameters with an EM algorithm.

Usage

PF_EM(
  formula,
  data,
  model = "logit",
  by,
  max_T,
  id,
  a_0,
  Q_0,
  Q,
  order = 1,
  control = PF_control(...),
  trace = 0,
  seed = NULL,
  type = "RW",
  fixed = NULL,
  random = NULL,
  Fmat,
  fixed_effects,
  G,
  theta,
  J,
  K,
  psi,
  phi,
  ...
)

Value

An object of class PF_EM.

Arguments

formula: coxph like formula with Surv(tstart, tstop, event) on the left hand site of ~.
data: data.frame or environment containing the outcome and covariates.
model: either 'logit' for binary outcomes with the logistic link function, 'cloglog' for binary outcomes with the inverse cloglog link function, or 'exponential' for piecewise constant exponential distributed arrival times.
by: interval length of the bins in which parameters are fixed.
max_T: end of the last interval interval.
id: vector of ids for each row of the in the design matrix.
a_0: vector $a_0$ for the initial coefficient vector for the first iteration (optional). Default is estimates from static model (see static_glm).
Q_0: covariance matrix for the prior distribution.
Q: initial covariance matrix for the state equation.
order: order of the random walk.
control: see PF_control.
trace: argument to get progress information. Zero will yield no info and larger integer values will yield incrementally more information.
seed: seed to set at the start of every EM iteration. See set.seed.
type: type of state model. Either "RW" for a [R]andom [W]alk or "VAR" for [V]ector [A]uto[R]egression.
fixed: two-sided formula to be used with random instead of formula. It is of the form Surv(tstart, tstop, event) ~ x or Surv(tstart, tstop, event) ~ - 1 for no fixed effects.
random: one-sided formula to be used with fixed instead of formula. It is of the form ~ z.
Fmat: starting value for $F$ when type = "VAR". See 'Details' in PF_EM.
fixed_effects: starting values for fixed effects if any. See ddFixed.
G, theta, J, K, psi, phi: parameters for a restricted type = "VAR" model. See the vignette mentioned in 'Details' of PF_EM and the examples linked to in 'See Also'.
...: optional way to pass arguments to control.

Warning

The function is still under development so the output and API may change.

Details

Estimates a state model of the form

$$\alpha_t = F \alpha_t + R\epsilon_t, \qquad \epsilon_t \sim N(0, Q)$$

where $F\in{\rm I\!R}^{p\times p}$ has full rank, $\alpha_t\in {\rm I\!R}^p$, $\epsilon_t\in {\rm I\!R}^r$, $r \leq p$, and $R = (e_{l_1},e_{l_2},\dots,e_{l_r})$ where $e_k$ is column from the $p$ dimensional identity matrix and $l_1<l_2<\dots<l_r$. The time zero state is drawn from

$$\alpha_0\sim N(a_0, Q_0)$$

with $Q_0 \in {\rm I\!R}^{p\times p}$. The latent states, $\alpha_t$, are related to the output through the linear predictors

$$\eta_{it} = X_t(R^+\alpha_t) + Z_t\beta$$

where $X_t\in{\rm I\!R}^{n_t\times r}$ and $Z_t{\rm I\!R}^{n_t\times c}$ are design matrices and the outcome for a individual $i$ at time $t$ is distributed according to an exponential family member given $\eta_{it}$. $\beta$ are constant coefficients.

See vignette("Particle_filtering", "dynamichazard") for details.

Examples

Run this code

# \donttest{
#####
# Fit model with lung data set from survival
# Warning: long-ish computation time

library(dynamichazard)
.lung <- lung[!is.na(lung$ph.ecog), ]
# standardize
.lung$age <- scale(.lung$age)

# fit
set.seed(43588155)
pf_fit <- PF_EM(
 Surv(time, status == 2) ~ ddFixed(ph.ecog) + age,
 data = .lung, by = 50, id = 1:nrow(.lung),
 Q_0 = diag(1, 2), Q = diag(.5^2, 2),
 max_T = 800,
 control = PF_control(
    # these number should be larger! Small for CRAN checks
    N_fw_n_bw = 100L, N_first = 250L, N_smooth = 100L,
    n_max = 50, eps = .001, Q_tilde = diag(.2^2, 2), est_a_0 = FALSE,
    n_threads = 2))

# Plot state vector estimates
plot(pf_fit, cov_index = 1)
plot(pf_fit, cov_index = 2)

# Plot log-likelihood
plot(pf_fit$log_likes)
# }
# \donttest{
######
# example with fixed intercept

# prepare data
temp <- subset(pbc, id <= 312, select=c(id, sex, time, status, edema, age))
pbc2 <- tmerge(temp, temp, id=id, death = event(time, status))
pbc2 <- tmerge(pbc2, pbcseq, id=id, albumin = tdc(day, albumin),
              protime = tdc(day, protime), bili = tdc(day, bili))
pbc2 <- pbc2[, c("id", "tstart", "tstop", "death", "sex", "edema",
                "age", "albumin", "protime", "bili")]
pbc2 <- within(pbc2, {
 log_albumin <- log(albumin)
 log_protime <- log(protime)
 log_bili <- log(bili)
})

# standardize
for(c. in c("age", "log_albumin", "log_protime", "log_bili"))
 pbc2[[c.]] <- drop(scale(pbc2[[c.]]))

# fit model with extended Kalman filter
ddfit <- ddhazard(
 Surv(tstart, tstop, death == 2) ~ ddFixed_intercept() + ddFixed(age) +
   ddFixed(edema) + ddFixed(log_albumin) + ddFixed(log_protime) + log_bili,
 pbc2, Q_0 = 100, Q = 1e-2, by = 100, id = pbc2$id,
 model = "exponential", max_T = 3600,
 control = ddhazard_control(eps = 1e-5, NR_eps = 1e-4, n_max = 1e4))
summary(ddfit)

# fit model with particle filter
set.seed(88235076)
pf_fit <- PF_EM(
  Surv(tstart, tstop, death == 2) ~ ddFixed_intercept() + ddFixed(age) +
    ddFixed(edema) + ddFixed(log_albumin) + ddFixed(log_protime) + log_bili,
  pbc2, Q_0 = 2^2, Q = ddfit$Q * 100, # use estimate from before
  by = 100, id = pbc2$id,
  model = "exponential", max_T = 3600,
  control = PF_control(
    # these number should be larger! Small for CRAN checks
    N_fw_n_bw = 100, N_smooth = 250, N_first = 100, eps = 1e-3,
    method = "AUX_normal_approx_w_cloud_mean", est_a_0 = FALSE,
    Q_tilde = as.matrix(.1^2),
    n_max = 25, # just take a few iterations as an example
    n_threads = 2))

# compare results
plot(ddfit)
plot(pf_fit)
sqrt(ddfit$Q * 100)
sqrt(pf_fit$Q)
rbind(ddfit$fixed_effects, pf_fit$fixed_effects)
# }
# \donttest{
#####
# simulation example with `random` and `fixed` argument and a restricted
# model

# g groups with k individuals in each
g <- 3L
k <- 400L

# matrices for state equation
p <- g + 1L
G <- matrix(0., p^2, 2L)
for(i in 1:p)
  G[i + (i - 1L) * p, 1L + (i == p)] <- 1L

theta <- c(.9, .8)
# coefficients in transition density
(F. <- matrix(as.vector(G %*% theta), 4L, 4L))

J <- matrix(0., ncol = 2L, nrow = p)
J[-p, 1L] <- J[p, 2L] <- 1
psi <- c(log(c(.3, .1)))

K <- matrix(0., p * (p - 1L) / 2L, 2L)
j <- 0L
for(i in (p - 1L):1L){
  j <- j + i
  K[j, 2L] <- 1
}
K[K[, 2L] < 1, 1L] <- 1
phi <- log(-(c(.8, .3) + 1) / (c(.8, .3) - 1))

V <- diag(exp(drop(J %*% psi)))
C <- diag(1, ncol(V))
C[lower.tri(C)] <- 2/(1 + exp(-drop(K %*% phi))) - 1
C[upper.tri(C)] <- t(C)[upper.tri(C)]
(Q <- V %*% C %*% V)     # covariance matrix in transition density
cov2cor(Q)

Q_0 <- get_Q_0(Q, F.)    # time-invariant covariance matrix
beta <- c(rep(-6, g), 0) # all groups have the same long run mean intercept

# simulate state variables
set.seed(56219373)
n_periods <- 300L
alphas <- matrix(nrow = n_periods + 1L, ncol = p)
alphas[1L, ] <- rnorm(p) %*% chol(Q_0)
for(i in 1:n_periods + 1L)
  alphas[i, ] <- F. %*% alphas[i - 1L, ] + drop(rnorm(p) %*% chol(Q))

alphas <- t(t(alphas) + beta)

# plot state variables
matplot(alphas, type = "l", lty = 1)

# simulate individuals' outcome
n_obs <- g * k
df <- lapply(1:n_obs, function(i){
  # find the group
  grp <- (i - 1L) %/% (n_obs / g) + 1L

  # left-censoring
  tstart <- max(0L, sample.int((n_periods - 1L) * 2L, 1) - n_periods + 1L)

  # covariates
  x <- c(1, rnorm(1))

  # outcome (stop time and event indicator)
  osa <- NULL
  oso <- NULL
  osx <- NULL
  y <- FALSE
  for(tstop in (tstart + 1L):n_periods){
    sigmoid <- 1 / (1 + exp(- drop(x %*% alphas[tstop + 1L, c(grp, p)])))
    if(sigmoid > runif(1)){
      y <- TRUE
      break
    }
    if(.01 > runif(1L) && tstop < n_periods){
      # sample new covariate
      osa <- c(osa, tstart)
      tstart <- tstop
      oso <- c(oso, tstop)
      osx <- c(osx, x[2])
      x[2] <- rnorm(1)
    }
  }

  cbind(
    tstart = c(osa, tstart), tstop = c(oso, tstop),
    x = c(osx, x[2]), y = c(rep(FALSE, length(osa)), y), grp = grp,
    id = i)
})
df <- data.frame(do.call(rbind, df))
df$grp <- factor(df$grp)

# fit model. Start with "cheap" iterations
fit <- PF_EM(
  fixed = Surv(tstart, tstop, y) ~ x, random = ~ grp + x - 1,
  data = df, model = "logit", by = 1L, max_T = max(df$tstop),
  Q_0 = diag(1.5^2, p), id = df$id, type = "VAR",
  G = G, theta = c(.5, .5), J = J, psi = log(c(.1, .1)),
  K = K, phi = log(-(c(.4, 0) + 1) / (c(.4, 0) - 1)),
  control = PF_control(
    N_fw_n_bw = 100L, N_smooth = 100L, N_first = 500L,
    method = "AUX_normal_approx_w_cloud_mean",
    nu = 5L, # sample from multivariate t-distribution
    n_max = 60L,  averaging_start = 50L,
    smoother = "Fearnhead_O_N", eps = 1e-4, covar_fac = 1.2,
    n_threads = 2L # depends on your cpu(s)
  ),
  trace = 1L)
plot(fit$log_likes) # log-likelihood approximation at each iterations

# you can take more iterations by uncommenting the following
# cl <- fit$call
# ctrl <- cl[["control"]]
# ctrl[c("N_fw_n_bw", "N_smooth", "N_first", "n_max",
#        "averaging_start")] <- list(500L, 2000L, 5000L, 200L, 30L)
# cl[["control"]] <- ctrl
# cl[c("phi", "psi", "theta")] <- list(fit$phi, fit$psi, fit$theta)
# fit_extra <- eval(cl)

plot(fit$log_likes) # log-likelihood approximation at each iteration

# check estimates
sqrt(diag(fit$Q))
sqrt(diag(Q))
cov2cor(fit$Q)
cov2cor(Q)
fit$F
F.

# plot predicted state variables
for(i in 1:p){
  plot(fit, cov_index = i)
  abline(h = 0, lty = 2)
  lines(1:nrow(alphas) - 1, alphas[, i] - beta[i], lty = 3)
}
# }

Run the code above in your browser using DataLab