R/brm.R

#' Fit Bayesian Generalized (Non-)Linear Multivariate Multilevel Models
#' 
#' Fit Bayesian generalized (non-)linear multivariate multilevel models 
#' using Stan for full Bayesian inference. A wide range of distributions 
#' and link functions are supported, allowing users to fit -- among others -- 
#' linear, robust linear, count data, survival, response times, ordinal, 
#' zero-inflated, hurdle, and even self-defined mixture models all in a 
#' multilevel context. Further modeling options include non-linear and 
#' smooth terms, auto-correlation structures, censored data, meta-analytic 
#' standard errors, and quite a few more. In addition, all parameters of the 
#' response distributions can be predicted in order to perform distributional 
#' regression. Prior specifications are flexible and explicitly encourage 
#' users to apply prior distributions that actually reflect their beliefs.
#' In addition, model fit can easily be assessed and compared with
#' posterior predictive checks and leave-one-out cross-validation.
#' 
#' @param formula An object of class \code{\link[stats:formula]{formula}},
#'   \code{\link{brmsformula}}, or \code{\link{mvbrmsformula}} (or one that can
#'   be coerced to that classes): A symbolic description of the model to be
#'   fitted. The details of model specification are explained in
#'   \code{\link{brmsformula}}.
#' @param data An object of class \code{data.frame} (or one that can be coerced
#'   to that class) containing data of all variables used in the model.
#' @param family A description of the response distribution and link function to
#'   be used in the model. This can be a family function, a call to a family
#'   function or a character string naming the family. Every family function has
#'   a \code{link} argument allowing to specify the link function to be applied
#'   on the response variable. If not specified, default links are used. For
#'   details of supported families see \code{\link{brmsfamily}}. By default, a
#'   linear \code{gaussian} model is applied. In multivariate models,
#'   \code{family} might also be a list of families.
#' @param prior One or more \code{brmsprior} objects created by
#'   \code{\link{set_prior}} or related functions and combined using the
#'   \code{c} method or the \code{+} operator. See also  \code{\link{get_prior}}
#'   for more help.
#' @param data2 A named \code{list} of objects containing data, which
#'   cannot be passed via argument \code{data}. Required for some objects 
#'   used in autocorrelation structures to specify dependency structures
#'   as well as for within-group covariance matrices.
#' @param autocor (Deprecated) An optional \code{\link{cor_brms}} object
#'   describing the correlation structure within the response variable (i.e.,
#'   the 'autocorrelation'). See the documentation of \code{\link{cor_brms}} for
#'   a description of the available correlation structures. Defaults to
#'   \code{NULL}, corresponding to no correlations. In multivariate models,
#'   \code{autocor} might also be a list of autocorrelation structures.
#'   It is now recommend to specify autocorrelation terms directly
#'   within \code{formula}. See \code{\link{brmsformula}} for more details.
#' @param sparse (Deprecated) Logical; indicates whether the population-level
#'   design matrices should be treated as sparse (defaults to \code{FALSE}). For
#'   design matrices with many zeros, this can considerably reduce required
#'   memory. Sampling speed is currently not improved or even slightly
#'   decreased. It is now recommended to use the \code{sparse} argument of
#'   \code{\link{brmsformula}} and related functions.
#' @param cov_ranef (Deprecated) A list of matrices that are proportional to the
#'   (within) covariance structure of the group-level effects. The names of the
#'   matrices should correspond to columns in \code{data} that are used as
#'   grouping factors. All levels of the grouping factor should appear as
#'   rownames of the corresponding matrix. This argument can be used, among
#'   others to model pedigrees and phylogenetic effects. 
#'   It is now recommended to specify those matrices in the formula
#'   interface using the \code{\link{gr}} and related functions. See
#'   \code{vignette("brms_phylogenetics")} for more details.
#' @param save_pars An object generated by \code{\link{save_pars}} controlling
#'   which parameters should be saved in the model. The argument has no
#'   impact on the model fitting itself.
#' @param save_ranef (Deprecated) A flag to indicate if group-level effects for
#'   each level of the grouping factor(s) should be saved (default is
#'   \code{TRUE}). Set to \code{FALSE} to save memory. The argument has no
#'   impact on the model fitting itself.
#' @param save_mevars (Deprecated) A flag to indicate if samples of latent
#'   noise-free variables obtained by using \code{me} and \code{mi} terms should
#'   be saved (default is \code{FALSE}). Saving these samples allows to better
#'   use methods such as \code{predict} with the latent variables but leads to
#'   very large \R objects even for models of moderate size and complexity.
#' @param save_all_pars (Deprecated) A flag to indicate if samples from all
#'   variables defined in Stan's \code{parameters} block should be saved
#'   (default is \code{FALSE}). Saving these samples is required in order to
#'   apply the methods \code{bridge_sampler}, \code{bayes_factor}, and
#'   \code{post_prob}.
#' @param sample_prior Indicate if samples from priors should be drawn
#'   additionally to the posterior samples. Options are \code{"no"} (the
#'   default), \code{"yes"}, and \code{"only"}. Among others, these samples can
#'   be used to calculate Bayes factors for point hypotheses via
#'   \code{\link{hypothesis}}. Please note that improper priors are not sampled,
#'   including the default improper priors used by \code{brm}. See
#'   \code{\link{set_prior}} on how to set (proper) priors. Please also note
#'   that prior samples for the overall intercept are not obtained by default
#'   for technical reasons. See \code{\link{brmsformula}} how to obtain prior
#'   samples for the intercept. If \code{sample_prior} is set to \code{"only"},
#'   samples are drawn solely from the priors ignoring the likelihood, which
#'   allows among others to generate samples from the prior predictive
#'   distribution. In this case, all parameters must have proper priors.
#' @param knots Optional list containing user specified knot values to be used
#'   for basis construction of smoothing terms. See
#'   \code{\link[mgcv:gamm]{gamm}} for more details.
#' @param stanvars An optional \code{stanvars} object generated by function
#'   \code{\link{stanvar}} to define additional variables for use in
#'   \pkg{Stan}'s program blocks.
#' @param stan_funs (Deprecated) An optional character string containing
#'   self-defined  \pkg{Stan} functions, which will be included in the functions
#'   block of the generated \pkg{Stan} code. It is now recommended to use the
#'   \code{stanvars} argument for this purpose instead.
#' @param fit An instance of S3 class \code{brmsfit} derived from a previous
#'   fit; defaults to \code{NA}. If \code{fit} is of class \code{brmsfit}, the
#'   compiled model associated with the fitted result is re-used and all
#'   arguments modifying the model code or data are ignored. It is not
#'   recommended to use this argument directly, but to call the
#'   \code{\link[brms:update.brmsfit]{update}} method, instead.
#' @param inits Either \code{"random"} or \code{"0"}. If inits is
#'   \code{"random"} (the default), Stan will randomly generate initial values
#'   for parameters. If it is \code{"0"}, all parameters are initialized to
#'   zero. This option is sometimes useful for certain families, as it happens
#'   that default (\code{"random"}) inits cause samples to be essentially
#'   constant. Generally, setting \code{inits = "0"} is worth a try, if chains
#'   do not behave well. Alternatively, \code{inits} can be a list of lists
#'   containing the initial values, or a function (or function name) generating
#'   initial values. The latter options are mainly implemented for internal
#'   testing but are available to users if necessary. If specifying initial 
#'   values using a list or a function then currently the parameter names must
#'   correspond to the names used in the generated Stan code (not the names
#'   used in \R). For more details on specifying initial values you can consult 
#'   the documentation of the selected \code{backend}.
#' @param chains Number of Markov chains (defaults to 4).
#' @param iter Number of total iterations per chain (including warmup; defaults
#'   to 2000).
#' @param warmup A positive integer specifying number of warmup (aka burnin)
#'   iterations. This also specifies the number of iterations used for stepsize
#'   adaptation, so warmup samples should not be used for inference. The number
#'   of warmup should not be larger than \code{iter} and the default is
#'   \code{iter/2}.
#' @param thin Thinning rate. Must be a positive integer. Set \code{thin > 1} to
#'   save memory and computation time if \code{iter} is large.
#' @param cores Number of cores to use when executing the chains in parallel,
#'   which defaults to 1 but we recommend setting the \code{mc.cores} option to
#'   be as many processors as the hardware and RAM allow (up to the number of
#'   chains). For non-Windows OS in non-interactive \R sessions, forking is used
#'   instead of PSOCK clusters.
#' @param threads Number of threads to use in within-chain parallelization. For
#'   more control over the threading process, \code{threads} may also be a
#'   \code{brmsthreads} object created by \code{\link{threading}}. Within-chain
#'   parallelization is experimental! We recommend its use only if you are
#'   experienced with Stan's \code{reduce_sum} function and have a slow running
#'   model that cannot be sped up by any other means.
#' @param algorithm Character string naming the estimation approach to use.
#'   Options are \code{"sampling"} for MCMC (the default), \code{"meanfield"} for
#'   variational inference with independent normal distributions,
#'   \code{"fullrank"} for variational inference with a multivariate normal
#'   distribution, or \code{"fixed_param"} for sampling from fixed parameter
#'   values. Can be set globally for the current \R session via the
#'   \code{"brms.algorithm"} option (see \code{\link{options}}).
#' @param backend Character string naming the package to use as the backend for
#'   fitting the Stan model. Options are \code{"rstan"} (the default) or
#'   \code{"cmdstanr"}. Can be set globally for the current \R session via the
#'   \code{"brms.backend"} option (see \code{\link{options}}). Details on the
#'   \pkg{rstan} and \pkg{cmdstanr} packages are available at
#'   \url{https://mc-stan.org/rstan/} and \url{https://mc-stan.org/cmdstanr/},
#'   respectively.
#' @param control A named \code{list} of parameters to control the sampler's
#'   behavior. It defaults to \code{NULL} so all the default values are used.
#'   The most important control parameters are discussed in the 'Details'
#'   section below. For a comprehensive overview see
#'   \code{\link[rstan:stan]{stan}}.
#' @param future Logical; If \code{TRUE}, the \pkg{\link[future:future]{future}}
#'   package is used for parallel execution of the chains and argument
#'   \code{cores} will be ignored. Can be set globally for the current \R
#'   session via the \code{future} option. The execution type is controlled via
#'   \code{\link[future:plan]{plan}} (see the examples section below).
#' @param silent Logical; If \code{TRUE} (the default), most of the
#'   informational messages of compiler and sampler are suppressed. The actual
#'   sampling progress is still printed. Set \code{refresh = 0} to turn this off
#'   as well. If using \code{backend = "rstan"} you can also set
#'   \code{open_progress = FALSE} to prevent opening additional progress bars.
#' @param seed The seed for random number generation to make results
#'   reproducible. If \code{NA} (the default), \pkg{Stan} will set the seed
#'   randomly.
#' @param save_model Either \code{NULL} or a character string. In the latter
#'   case, the model's Stan code is saved via \code{\link{cat}} in a text file
#'   named after the string supplied in \code{save_model}.
#' @param file Either \code{NULL} or a character string. In the latter case, the
#'   fitted model object is saved via \code{\link{saveRDS}} in a file named
#'   after the string supplied in \code{file}. The \code{.rds} extension is
#'   added automatically. If the file already exists, \code{brm} will load and
#'   return the saved model object instead of refitting the model. As existing
#'   files won't be overwritten, you have to manually remove the file in order
#'   to refit and save the model under an existing file name. The file name
#'   is stored in the \code{brmsfit} object for later usage.
#' @param empty Logical. If \code{TRUE}, the Stan model is not created
#'   and compiled and the corresponding \code{'fit'} slot of the \code{brmsfit}
#'   object will be empty. This is useful if you have estimated a brms-created
#'   Stan model outside of \pkg{brms} and want to feed it back into the package.
#' @param rename For internal use only. 
#' @param stan_model_args A \code{list} of further arguments passed to
#'   \code{\link[rstan:stan_model]{stan_model}}.
#' @param ... Further arguments passed to Stan. 
#'   For \code{backend = "rstan"} the arguments are passed to
#'   \code{\link[rstan:stanmodel-method-sampling]{sampling}} or
#'   \code{\link[rstan:stanmodel-method-vb]{vb}}.
#'   For \code{backend = "cmdstanr"} the arguments are passed to the
#'   \code{cmdstanr::sample} or \code{cmdstanr::variational} method.
#' 
#' @return An object of class \code{brmsfit}, which contains the posterior
#'   samples along with many other useful information about the model. Use
#'   \code{methods(class = "brmsfit")} for an overview on available methods.
#'  
#' @author Paul-Christian Buerkner \email{paul.buerkner@@gmail.com}
#'
#' @details Fit a generalized (non-)linear multivariate multilevel model via
#'   full Bayesian inference using Stan. A general overview is provided in the
#'   vignettes \code{vignette("brms_overview")} and
#'   \code{vignette("brms_multilevel")}. For a full list of available vignettes
#'   see \code{vignette(package = "brms")}.
#'
#'   \bold{Formula syntax of brms models}
#'
#'   Details of the formula syntax applied in \pkg{brms} can be found in
#'   \code{\link{brmsformula}}.
#'
#'   \bold{Families and link functions}
#'
#'   Details of families supported by \pkg{brms} can be found in
#'   \code{\link{brmsfamily}}.
#'
#'   \bold{Prior distributions}
#'
#'   Priors should be specified using the
#'   \code{\link[brms:set_prior]{set_prior}} function. Its documentation
#'   contains detailed information on how to correctly specify priors. To find
#'   out on which parameters or parameter classes priors can be defined, use
#'   \code{\link[brms:get_prior]{get_prior}}. Default priors are chosen to be
#'   non or very weakly informative so that their influence on the results will
#'   be negligible and you usually don't have to worry about them. However,
#'   after getting more familiar with Bayesian statistics, I recommend you to
#'   start thinking about reasonable informative priors for your model
#'   parameters: Nearly always, there is at least some prior information
#'   available that can be used to improve your inference.
#'
#'   \bold{Adjusting the sampling behavior of \pkg{Stan}}
#'
#'   In addition to choosing the number of iterations, warmup samples, and
#'   chains, users can control the behavior of the NUTS sampler, by using the
#'   \code{control} argument. The most important reason to use \code{control} is
#'   to decrease (or eliminate at best) the number of divergent transitions that
#'   cause a bias in the obtained posterior samples. Whenever you see the
#'   warning "There were x divergent transitions after warmup." you should
#'   really think about increasing \code{adapt_delta}. To do this, write
#'   \code{control = list(adapt_delta = <x>)}, where \code{<x>} should usually
#'   be value between \code{0.8} (current default) and \code{1}. Increasing
#'   \code{adapt_delta} will slow down the sampler but will decrease the number
#'   of divergent transitions threatening the validity of your posterior
#'   samples.
#'
#'   Another problem arises when the depth of the tree being evaluated in each
#'   iteration is exceeded. This is less common than having divergent
#'   transitions, but may also bias the posterior samples. When it happens,
#'   \pkg{Stan} will throw out a warning suggesting to increase
#'   \code{max_treedepth}, which can be accomplished by writing \code{control =
#'   list(max_treedepth = <x>)} with a positive integer \code{<x>} that should
#'   usually be larger than the current default of \code{10}. For more details
#'   on the \code{control} argument see \code{\link[rstan:stan]{stan}}.
#'
#' @references 
#' Paul-Christian Buerkner (2017). brms: An R Package for Bayesian Multilevel 
#' Models Using Stan. \emph{Journal of Statistical Software}, 80(1), 1-28. 
#' \code{doi:10.18637/jss.v080.i01}
#' 
#' Paul-Christian Buerkner (2018). Advanced Bayesian Multilevel Modeling 
#' with the R Package brms. \emph{The R Journal}. 10(1), 395–411. 
#' \code{doi:10.32614/RJ-2018-017}
#'
#' @seealso \code{\link{brms}}, \code{\link{brmsformula}},
#' \code{\link{brmsfamily}}, \code{\link{brmsfit}}
#'
#' @examples
#' \dontrun{
#' # Poisson regression for the number of seizures in epileptic patients
#' # using normal priors for population-level effects
#' # and half-cauchy priors for standard deviations of group-level effects
#' prior1 <- prior(normal(0,10), class = b) +
#'   prior(cauchy(0,2), class = sd)
#' fit1 <- brm(count ~ zAge + zBase * Trt + (1|patient),
#'             data = epilepsy, family = poisson(), prior = prior1)
#'
#' # generate a summary of the results
#' summary(fit1)
#'
#' # plot the MCMC chains as well as the posterior distributions
#' plot(fit1, ask = FALSE)
#'
#' # predict responses based on the fitted model
#' head(predict(fit1))
#'
#' # plot conditional effects for each predictor
#' plot(conditional_effects(fit1), ask = FALSE)
#'
#' # investigate model fit
#' loo(fit1)
#' pp_check(fit1)
#'
#'
#' # Ordinal regression modeling patient's rating of inhaler instructions
#' # category specific effects are estimated for variable 'treat'
#' fit2 <- brm(rating ~ period + carry + cs(treat),
#'             data = inhaler, family = sratio("logit"),
#'             prior = set_prior("normal(0,5)"), chains = 2)
#' summary(fit2)
#' plot(fit2, ask = FALSE)
#' WAIC(fit2)
#'
#'
#' # Survival regression modeling the time between the first
#' # and second recurrence of an infection in kidney patients.
#' fit3 <- brm(time | cens(censored) ~ age * sex + disease + (1|patient),
#'             data = kidney, family = lognormal())
#' summary(fit3)
#' plot(fit3, ask = FALSE)
#' plot(conditional_effects(fit3), ask = FALSE)
#'
#'
#' # Probit regression using the binomial family
#' ntrials <- sample(1:10, 100, TRUE)
#' success <- rbinom(100, size = ntrials, prob = 0.4)
#' x <- rnorm(100)
#' data4 <- data.frame(ntrials, success, x)
#' fit4 <- brm(success | trials(ntrials) ~ x, data = data4,
#'             family = binomial("probit"))
#' summary(fit4)
#'
#'
#' # Non-linear Gaussian model
#' fit5 <- brm(
#'   bf(cum ~ ult * (1 - exp(-(dev/theta)^omega)),
#'      ult ~ 1 + (1|AY), omega ~ 1, theta ~ 1, 
#'      nl = TRUE),
#'   data = loss, family = gaussian(),
#'   prior = c(
#'     prior(normal(5000, 1000), nlpar = "ult"),
#'     prior(normal(1, 2), nlpar = "omega"),
#'     prior(normal(45, 10), nlpar = "theta")
#'   ),
#'   control = list(adapt_delta = 0.9)
#' )
#' summary(fit5)
#' conditional_effects(fit5)
#'
#'
#' # Normal model with heterogeneous variances
#' data_het <- data.frame(
#'   y = c(rnorm(50), rnorm(50, 1, 2)),
#'   x = factor(rep(c("a", "b"), each = 50))
#' )
#' fit6 <- brm(bf(y ~ x, sigma ~ 0 + x), data = data_het)
#' summary(fit6)
#' plot(fit6)
#' conditional_effects(fit6)
#'
#' # extract estimated residual SDs of both groups
#' sigmas <- exp(posterior_samples(fit6, "^b_sigma_"))
#' ggplot(stack(sigmas), aes(values)) +
#'   geom_density(aes(fill = ind))
#'
#'
#' # Quantile regression predicting the 25%-quantile
#' fit7 <- brm(bf(y ~ x, quantile = 0.25), data = data_het,
#'             family = asym_laplace())
#' summary(fit7)
#' conditional_effects(fit7)
#'
#'
#' # use the future package for more flexible parallelization
#' library(future)
#' plan(multiprocess)
#' fit7 <- update(fit7, future = TRUE)
#' 
#' 
#' # fit a model manually via rstan
#' scode <- make_stancode(count ~ Trt, data = epilepsy)
#' sdata <- make_standata(count ~ Trt, data = epilepsy)
#' stanfit <- rstan::stan(model_code = scode, data = sdata)
#' # feed the Stan model back into brms
#' fit8 <- brm(count ~ Trt, data = epilepsy, empty = TRUE)
#' fit8$fit <- stanfit
#' fit8 <- rename_pars(fit8)
#' summary(fit8)
#' }
#'
#' @import parallel
#' @import methods
#' @import stats
#' @import Rcpp
#' @export
brm <- function(formula, data, family = gaussian(), prior = NULL, 
                autocor = NULL, data2 = NULL, cov_ranef = NULL, 
                sample_prior = "no", sparse = NULL, knots = NULL,
                stanvars = NULL, stan_funs = NULL, fit = NA, 
                save_pars = NULL, save_ranef = NULL, 
                save_mevars = NULL, save_all_pars = NULL, 
                inits = "random", chains = 4, iter = 2000, 
                warmup = floor(iter / 2), thin = 1,
                cores = getOption("mc.cores", 1), 
                threads = NULL, control = NULL,
                algorithm = getOption("brms.algorithm", "sampling"),
                backend = getOption("brms.backend", "rstan"),
                future = getOption("future", FALSE), silent = TRUE, 
                seed = NA, save_model = NULL, stan_model_args = list(),
                file = NULL, empty = FALSE, rename = TRUE, ...) {
  
  # optionally load brmsfit from file
  if (!is.null(file)) {
    x <- read_brmsfit(file)
    if (!is.null(x)) {
      return(x)
    }
  }
  
  # validate arguments later passed to Stan
  algorithm <- match.arg(algorithm, algorithm_choices())
  backend <- match.arg(backend, backend_choices())
  silent <- as_one_logical(silent)
  iter <- as_one_numeric(iter)
  warmup <- as_one_numeric(warmup)
  thin <- as_one_numeric(thin)
  chains <- as_one_numeric(chains)
  cores <- as_one_numeric(cores)
  threads <- validate_threads(threads)
  future <- as_one_logical(future) && chains > 0L
  seed <- as_one_numeric(seed, allow_na = TRUE)
  empty <- as_one_logical(empty)
  rename <- as_one_logical(rename)
  
  # initialize brmsfit object
  if (is.brmsfit(fit)) {
    # re-use existing model
    x <- fit
    x$criteria <- list()
    backend <- x$backend
    sdata <- standata(x)
    model <- compiled_model(x)
    exclude <- exclude_pars(x)
  } else {  
    # build new model
    formula <- validate_formula(
      formula, data = data, family = family, 
      autocor = autocor, sparse = sparse,
      cov_ranef = cov_ranef
    )
    family <- get_element(formula, "family")
    bterms <- brmsterms(formula)
    data_name <- substitute_name(data)
    data <- validate_data(data, bterms = bterms, knots = knots)
    attr(data, "data_name") <- data_name
    data2 <- validate_data2(
      data2, bterms = bterms, 
      get_data2_autocor(formula),
      get_data2_cov_ranef(formula)
    )
    prior <- validate_prior(
      prior, bterms = bterms, data = data,
      sample_prior = sample_prior
    )
    stanvars <- validate_stanvars(stanvars, stan_funs = stan_funs)
    save_pars <- validate_save_pars(
      save_pars, save_ranef = save_ranef, 
      save_mevars = save_mevars,
      save_all_pars = save_all_pars
    )
    ranef <- tidy_ranef(bterms, data = data)
    # generate Stan code
    model <- .make_stancode(
      bterms, data = data, prior = prior, 
      stanvars = stanvars, save_model = save_model,
      backend = backend, threads = threads
    )
    # initialize S3 object
    x <- brmsfit(
      formula = formula, data = data, data2 = data2, prior = prior, 
      stanvars = stanvars, model = model, algorithm = algorithm, 
      backend = backend, threads = threads, save_pars = save_pars, 
      ranef = ranef, family = family
    )
    exclude <- exclude_pars(x)
    # generate Stan data before compiling the model to avoid
    # unnecessary compilations in case of invalid data
    sdata <- .make_standata(
      bterms, data = data, prior = prior, data2 = data2, 
      stanvars = stanvars, threads = threads
    )
    if (empty) {
      # return the brmsfit object with an empty 'fit' slot
      return(x)
    }
    # compile the Stan model
    compile_args <- stan_model_args
    compile_args$model <- model
    compile_args$backend <- backend
    compile_args$threads <- threads
    model <- do_call(compile_model, compile_args)
  }
  
  # fit the Stan model
  fit_args <- nlist(
    model, sdata, algorithm, backend, iter, warmup, thin, chains, 
    cores, threads, inits, exclude, control, future, seed, silent, ...
  )
  x$fit <- do_call(fit_model, fit_args)

  # rename parameters to have human readable names
  if (rename) {
    x <- rename_pars(x) 
  }
  if (!is.null(file)) {
    write_brmsfit(x, file)
  }
  x
}