Learn R Programming

dynamichazard (version 1.0.2)

PF_EM: EM Estimation with Particle Filters and Smoothers

Description

Method to estimate the hyper parameters with an EM algorithm.

Usage

PF_EM(
  formula,
  data,
  model = "logit",
  by,
  max_T,
  id,
  a_0,
  Q_0,
  Q,
  order = 1,
  control = PF_control(...),
  trace = 0,
  seed = NULL,
  type = "RW",
  fixed = NULL,
  random = NULL,
  Fmat,
  fixed_effects,
  G,
  theta,
  J,
  K,
  psi,
  phi,
  ...
)

Value

An object of class PF_EM.

Arguments

formula

coxph like formula with Surv(tstart, tstop, event) on the left hand site of ~.

data

data.frame or environment containing the outcome and covariates.

model

either 'logit' for binary outcomes with the logistic link function, 'cloglog' for binary outcomes with the inverse cloglog link function, or 'exponential' for piecewise constant exponential distributed arrival times.

by

interval length of the bins in which parameters are fixed.

max_T

end of the last interval interval.

id

vector of ids for each row of the in the design matrix.

a_0

vector \(a_0\) for the initial coefficient vector for the first iteration (optional). Default is estimates from static model (see static_glm).

Q_0

covariance matrix for the prior distribution.

Q

initial covariance matrix for the state equation.

order

order of the random walk.

control

see PF_control.

trace

argument to get progress information. Zero will yield no info and larger integer values will yield incrementally more information.

seed

seed to set at the start of every EM iteration. See set.seed.

type

type of state model. Either "RW" for a [R]andom [W]alk or "VAR" for [V]ector [A]uto[R]egression.

fixed

two-sided formula to be used with random instead of formula. It is of the form Surv(tstart, tstop, event) ~ x or Surv(tstart, tstop, event) ~ - 1 for no fixed effects.

random

one-sided formula to be used with fixed instead of formula. It is of the form ~ z.

Fmat

starting value for \(F\) when type = "VAR". See 'Details' in PF_EM.

fixed_effects

starting values for fixed effects if any. See ddFixed.

G, theta, J, K, psi, phi

parameters for a restricted type = "VAR" model. See the vignette mentioned in 'Details' of PF_EM and the examples linked to in 'See Also'.

...

optional way to pass arguments to control.

Warning

The function is still under development so the output and API may change.

Details

Estimates a state model of the form

$$\alpha_t = F \alpha_t + R\epsilon_t, \qquad \epsilon_t \sim N(0, Q)$$

where \(F\in{\rm I\!R}^{p\times p}\) has full rank, \(\alpha_t\in {\rm I\!R}^p\), \(\epsilon_t\in {\rm I\!R}^r\), \(r \leq p\), and \(R = (e_{l_1},e_{l_2},\dots,e_{l_r})\) where \(e_k\) is column from the \(p\) dimensional identity matrix and \(l_1<l_2<\dots<l_r\). The time zero state is drawn from

$$\alpha_0\sim N(a_0, Q_0)$$

with \(Q_0 \in {\rm I\!R}^{p\times p}\). The latent states, \(\alpha_t\), are related to the output through the linear predictors

$$\eta_{it} = X_t(R^+\alpha_t) + Z_t\beta$$

where \(X_t\in{\rm I\!R}^{n_t\times r}\) and \(Z_t{\rm I\!R}^{n_t\times c}\) are design matrices and the outcome for a individual \(i\) at time \(t\) is distributed according to an exponential family member given \(\eta_{it}\). \(\beta\) are constant coefficients.

See vignette("Particle_filtering", "dynamichazard") for details.

See Also

PF_forward_filter to get a more precise estimate of the final log-likelihood.

See the examples at https://github.com/boennecd/dynamichazard/tree/master/examples.

Examples

Run this code
# \donttest{
#####
# Fit model with lung data set from survival
# Warning: long-ish computation time

library(dynamichazard)
.lung <- lung[!is.na(lung$ph.ecog), ]
# standardize
.lung$age <- scale(.lung$age)

# fit
set.seed(43588155)
pf_fit <- PF_EM(
 Surv(time, status == 2) ~ ddFixed(ph.ecog) + age,
 data = .lung, by = 50, id = 1:nrow(.lung),
 Q_0 = diag(1, 2), Q = diag(.5^2, 2),
 max_T = 800,
 control = PF_control(
    # these number should be larger! Small for CRAN checks
    N_fw_n_bw = 100L, N_first = 250L, N_smooth = 100L,
    n_max = 50, eps = .001, Q_tilde = diag(.2^2, 2), est_a_0 = FALSE,
    n_threads = 2))

# Plot state vector estimates
plot(pf_fit, cov_index = 1)
plot(pf_fit, cov_index = 2)

# Plot log-likelihood
plot(pf_fit$log_likes)
# }
# \donttest{
######
# example with fixed intercept

# prepare data
temp <- subset(pbc, id <= 312, select=c(id, sex, time, status, edema, age))
pbc2 <- tmerge(temp, temp, id=id, death = event(time, status))
pbc2 <- tmerge(pbc2, pbcseq, id=id, albumin = tdc(day, albumin),
              protime = tdc(day, protime), bili = tdc(day, bili))
pbc2 <- pbc2[, c("id", "tstart", "tstop", "death", "sex", "edema",
                "age", "albumin", "protime", "bili")]
pbc2 <- within(pbc2, {
 log_albumin <- log(albumin)
 log_protime <- log(protime)
 log_bili <- log(bili)
})

# standardize
for(c. in c("age", "log_albumin", "log_protime", "log_bili"))
 pbc2[[c.]] <- drop(scale(pbc2[[c.]]))

# fit model with extended Kalman filter
ddfit <- ddhazard(
 Surv(tstart, tstop, death == 2) ~ ddFixed_intercept() + ddFixed(age) +
   ddFixed(edema) + ddFixed(log_albumin) + ddFixed(log_protime) + log_bili,
 pbc2, Q_0 = 100, Q = 1e-2, by = 100, id = pbc2$id,
 model = "exponential", max_T = 3600,
 control = ddhazard_control(eps = 1e-5, NR_eps = 1e-4, n_max = 1e4))
summary(ddfit)

# fit model with particle filter
set.seed(88235076)
pf_fit <- PF_EM(
  Surv(tstart, tstop, death == 2) ~ ddFixed_intercept() + ddFixed(age) +
    ddFixed(edema) + ddFixed(log_albumin) + ddFixed(log_protime) + log_bili,
  pbc2, Q_0 = 2^2, Q = ddfit$Q * 100, # use estimate from before
  by = 100, id = pbc2$id,
  model = "exponential", max_T = 3600,
  control = PF_control(
    # these number should be larger! Small for CRAN checks
    N_fw_n_bw = 100, N_smooth = 250, N_first = 100, eps = 1e-3,
    method = "AUX_normal_approx_w_cloud_mean", est_a_0 = FALSE,
    Q_tilde = as.matrix(.1^2),
    n_max = 25, # just take a few iterations as an example
    n_threads = 2))

# compare results
plot(ddfit)
plot(pf_fit)
sqrt(ddfit$Q * 100)
sqrt(pf_fit$Q)
rbind(ddfit$fixed_effects, pf_fit$fixed_effects)
# }
# \donttest{
#####
# simulation example with `random` and `fixed` argument and a restricted
# model

# g groups with k individuals in each
g <- 3L
k <- 400L

# matrices for state equation
p <- g + 1L
G <- matrix(0., p^2, 2L)
for(i in 1:p)
  G[i + (i - 1L) * p, 1L + (i == p)] <- 1L

theta <- c(.9, .8)
# coefficients in transition density
(F. <- matrix(as.vector(G %*% theta), 4L, 4L))

J <- matrix(0., ncol = 2L, nrow = p)
J[-p, 1L] <- J[p, 2L] <- 1
psi <- c(log(c(.3, .1)))

K <- matrix(0., p * (p - 1L) / 2L, 2L)
j <- 0L
for(i in (p - 1L):1L){
  j <- j + i
  K[j, 2L] <- 1
}
K[K[, 2L] < 1, 1L] <- 1
phi <- log(-(c(.8, .3) + 1) / (c(.8, .3) - 1))

V <- diag(exp(drop(J %*% psi)))
C <- diag(1, ncol(V))
C[lower.tri(C)] <- 2/(1 + exp(-drop(K %*% phi))) - 1
C[upper.tri(C)] <- t(C)[upper.tri(C)]
(Q <- V %*% C %*% V)     # covariance matrix in transition density
cov2cor(Q)

Q_0 <- get_Q_0(Q, F.)    # time-invariant covariance matrix
beta <- c(rep(-6, g), 0) # all groups have the same long run mean intercept

# simulate state variables
set.seed(56219373)
n_periods <- 300L
alphas <- matrix(nrow = n_periods + 1L, ncol = p)
alphas[1L, ] <- rnorm(p) %*% chol(Q_0)
for(i in 1:n_periods + 1L)
  alphas[i, ] <- F. %*% alphas[i - 1L, ] + drop(rnorm(p) %*% chol(Q))

alphas <- t(t(alphas) + beta)

# plot state variables
matplot(alphas, type = "l", lty = 1)

# simulate individuals' outcome
n_obs <- g * k
df <- lapply(1:n_obs, function(i){
  # find the group
  grp <- (i - 1L) %/% (n_obs / g) + 1L

  # left-censoring
  tstart <- max(0L, sample.int((n_periods - 1L) * 2L, 1) - n_periods + 1L)

  # covariates
  x <- c(1, rnorm(1))

  # outcome (stop time and event indicator)
  osa <- NULL
  oso <- NULL
  osx <- NULL
  y <- FALSE
  for(tstop in (tstart + 1L):n_periods){
    sigmoid <- 1 / (1 + exp(- drop(x %*% alphas[tstop + 1L, c(grp, p)])))
    if(sigmoid > runif(1)){
      y <- TRUE
      break
    }
    if(.01 > runif(1L) && tstop < n_periods){
      # sample new covariate
      osa <- c(osa, tstart)
      tstart <- tstop
      oso <- c(oso, tstop)
      osx <- c(osx, x[2])
      x[2] <- rnorm(1)
    }
  }

  cbind(
    tstart = c(osa, tstart), tstop = c(oso, tstop),
    x = c(osx, x[2]), y = c(rep(FALSE, length(osa)), y), grp = grp,
    id = i)
})
df <- data.frame(do.call(rbind, df))
df$grp <- factor(df$grp)

# fit model. Start with "cheap" iterations
fit <- PF_EM(
  fixed = Surv(tstart, tstop, y) ~ x, random = ~ grp + x - 1,
  data = df, model = "logit", by = 1L, max_T = max(df$tstop),
  Q_0 = diag(1.5^2, p), id = df$id, type = "VAR",
  G = G, theta = c(.5, .5), J = J, psi = log(c(.1, .1)),
  K = K, phi = log(-(c(.4, 0) + 1) / (c(.4, 0) - 1)),
  control = PF_control(
    N_fw_n_bw = 100L, N_smooth = 100L, N_first = 500L,
    method = "AUX_normal_approx_w_cloud_mean",
    nu = 5L, # sample from multivariate t-distribution
    n_max = 60L,  averaging_start = 50L,
    smoother = "Fearnhead_O_N", eps = 1e-4, covar_fac = 1.2,
    n_threads = 2L # depends on your cpu(s)
  ),
  trace = 1L)
plot(fit$log_likes) # log-likelihood approximation at each iterations

# you can take more iterations by uncommenting the following
# cl <- fit$call
# ctrl <- cl[["control"]]
# ctrl[c("N_fw_n_bw", "N_smooth", "N_first", "n_max",
#        "averaging_start")] <- list(500L, 2000L, 5000L, 200L, 30L)
# cl[["control"]] <- ctrl
# cl[c("phi", "psi", "theta")] <- list(fit$phi, fit$psi, fit$theta)
# fit_extra <- eval(cl)

plot(fit$log_likes) # log-likelihood approximation at each iteration

# check estimates
sqrt(diag(fit$Q))
sqrt(diag(Q))
cov2cor(fit$Q)
cov2cor(Q)
fit$F
F.

# plot predicted state variables
for(i in 1:p){
  plot(fit, cov_index = i)
  abline(h = 0, lty = 2)
  lines(1:nrow(alphas) - 1, alphas[, i] - beta[i], lty = 3)
}
# }

Run the code above in your browser using DataLab