Initialize quantileplot package

bstewart · May 3, 2021 · 6768e0d · 6768e0d
commit 6768e0d
Show file tree

Hide file tree

Showing 24 changed files with 2,036 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+inst/doc
+Meta
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,29 @@
+Package: quantileplot
+Version: 0.1.1
+Date: 2021-02-25
+Title: Smooth Quantile Visualization Tool 
+Description: This package creates a bivariate smooth quantile plot. The plot visualizes estimates of the marginal density of X, the conditional density of Y given X at selected values of X, and smooth curves showing trends in quantiles of Y given X as a function of X.
+Author: Robin C. Lee, Ian Lundberg, Brandon M. Stewart
+Maintainer: Robin C. Lee <[email protected]>
+RoxygenNote: 7.1.1
+Imports:
+    dplyr,
+    foreach,
+    ggplot2,
+    rlang,
+    ggthemes,
+    qgam,
+    MASS,
+    labeling,
+    stats,
+    gridExtra,
+    mgcv,
+    mvtnorm,
+    reshape2,
+    tidyr
+License: MIT + file LICENSE
+Encoding: UTF-8
+Suggests: 
+    rmarkdown,
+    knitr
+VignetteBuilder: knitr
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2021
+COPYRIGHT HOLDER: Robin C. Lee
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,23 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(plot,quantileplot)
+S3method(print,quantileplot)
+S3method(summary,quantileplot)
+export(gen_curves)
+export(gen_densities)
+export(quantileplot)
+import(dplyr)
+import(ggplot2)
+import(rlang)
+importFrom(ggthemes,scale_color_colorblind)
+importFrom(gridExtra,grid.arrange)
+importFrom(labeling,extended)
+importFrom(mgcv,PredictMat)
+importFrom(mvtnorm,rmvnorm)
+importFrom(qgam,mqgam)
+importFrom(reshape2,melt)
+importFrom(stats,density)
+importFrom(stats,dnorm)
+importFrom(stats,predict)
+importFrom(stats,qnorm)
+importFrom(stats,weighted.mean)
diff --git a/R/gen_curves.R b/R/gen_curves.R
@@ -0,0 +1,134 @@
+#' Generate Smooth Curves for a \code{quantileplot}
+#'
+#' @description Estimate smooth curves for quantiles of the outcome as a function of the predictor. This function calls \code{mqgam} from the \code{qgam} package. This function is typically called indirectly via a user call to \code{\link{quantileplot}}.
+#' @param formula A bivariate model formula (e.g. \code{y ~ x})
+#' @param second_formula Model formula to allow the learning rate to change as a function of the predictor. This is passed to \code{mqgam} as the second element in the \code{form} argument. Defaults to the same specification as \code{formula} but without the outcome variable.
+#' @param data Data frame containing the variables in \code{formula}. If \code{weights} are specified, they must be a column of \code{data}.
+#' @param weights String name for sampling weights, which are a column of \code{data}. If not given, a simple random sample is assumed.
+#' @param quantiles Numeric vector containing quantiles to be estimated. Values should be between 0 and 1.
+#' @param show_ci Logical, defaults to \code{FALSE}. Whether to show credible intervals for the estimated smooth quantile curves.
+#' @param ci Numeric probability value for credible intervals; default to 0.95 to produce 95 percent credible intervals. Only relevant if \code{show_ci = TRUE}.
+#' @param uncertainty_draws Numeric. If non-null, the number of simulated posterior draws to estimate for each smooth quantile curve. When used with the \code{plot} function, these appear in panels below the main plot.
+#' @param ... Other arguments passed to \code{mqgam} for fitting of smooth quantile curves.
+#' @return A list of length 2. Element \code{curves} is a data frame containing the data for plotting smooth curves for quantiles of the outcome given the predictor. Element \code{mqgam.out} is the fitted object from \code{mqgam}.
+#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
+#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
+#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
+#'
+#' @export
+#' @import dplyr
+#' @import ggplot2
+#' @import rlang
+#' @importFrom reshape2 melt
+#' @importFrom stats predict qnorm
+#' @importFrom qgam mqgam
+#' @importFrom mgcv PredictMat
+#' @importFrom mvtnorm rmvnorm
+gen_curves <- function(formula,
+                       second_formula,
+                       data,
+                       weights = NULL,
+                       quantiles = c(.1, .25, .5, .75, .9),
+                       show_ci = FALSE,
+                       ci = 0.95,
+                       uncertainty_draws = 10,
+                       ...) {
+  # Initialize objects that will be called by non-standard evaluation.
+  i <- curve <- x <- estimate <- se <- NULL
+
+  # Extract the x variable name from the formula
+  x_str <- all.vars(formula)[2]
+
+  # See if any smooth terms used in the formula
+  contains_smooth_terms <- any(grepl("s[(]",as.character(formula)))
+
+  # Estimate the smooth quantile curves
+  if (is.null(weights) & contains_smooth_terms) {
+    mqgam.out <- qgam::mqgam(list(formula, second_formula),
+                             data = data, qu = quantiles,
+                             ...)
+  } else if (!is.null(weights) & contains_smooth_terms) {
+    mqgam.out <- qgam::mqgam(list(formula, second_formula),
+                             data = data, qu = quantiles,
+                             argGam = list(weights = data[[weights]]),
+                             ...)
+  } else if (is.null(weights) & !contains_smooth_terms) {
+    mqgam.out <- qgam::mqgam(formula,
+                             data = data, qu = quantiles,
+                             ...)
+  } else if (!is.null(weights) & !contains_smooth_terms) {
+    mqgam.out <- qgam::mqgam(formula,
+                             data = data, qu = quantiles,
+                             argGam = list(weights = data[[weights]]),
+                             ...)
+  }
+
+  # Split the x range into 200 points where we will make predictions
+  to_predict <- data.frame(x = seq(min(data %>% group_by() %>% dplyr::select_at(x_str)),
+                                   max(data %>% group_by() %>% dplyr::select_at(x_str)),
+                                   length.out = 200)) %>%
+    rename_with(.fn = function(x) x_str)
+
+  # Generate point estimate and simulated curves
+  quantile_curves_list <- lapply(quantiles, function(q) {
+    quantile_index <- which(quantiles == q)
+    # Get the model matrix on the expanded basis
+    if (!contains_smooth_terms) {
+      X_basis_expansion <- mgcv::predict.gam(mqgam.out$fit[[1]],
+                                             type = "lpmatrix",
+                                             newdata = to_predict)
+    } else {
+      # For smooth terms, need a different function to predict the model matrix
+      # cbind adds the intercept to the smooth
+      X_basis_expansion <- cbind(1,mgcv::PredictMat(mqgam.out$smooth[[1]],
+                                                    data = to_predict))
+    }
+    # Extract the fitted coefficients beta
+    beta <- mqgam.out$fit[[quantile_index]]$coefficients
+    # Extract the variance-covariance matrix Sigma
+    rV <- mqgam.out$fit[[quantile_index]]$rV
+    sig2 <- mqgam.out$fit[[quantile_index]]$sig2
+    Sigma <- rV %*% t(rV) * sig2
+
+    # Calculate the point estimate and standard error
+    yhat_point <- X_basis_expansion %*% beta
+    yhat_se <- sqrt(diag(X_basis_expansion %*% Sigma %*% t(X_basis_expansion)))
+    # Create a data frame of the point estimate
+    point_df <- data.frame(curve = "point",
+                           x = to_predict[[1]],
+                           estimate = yhat_point,
+                           se = yhat_se)
+
+    # Calculate uncertainty_draws simulated curves
+    if (!is.null(uncertainty_draws)) {
+      beta_star <- t(mvtnorm::rmvnorm(uncertainty_draws, mean = beta, sigma = Sigma))
+      yhat_star <- X_basis_expansion %*% beta_star
+      # Create a data frame of the simulated curves
+      sim_curves_df <- data.frame(yhat_star) %>%
+        dplyr::rename_all(.funs = function(x) gsub("X","sim",x)) %>%
+        dplyr::mutate(x = to_predict[[1]]) %>%
+        reshape2::melt(id = "x", variable.name = "curve", value.name = "estimate") %>%
+        dplyr::mutate(se = NA) %>%
+        dplyr::select(curve, x, estimate, se)
+    }
+    to_return <- point_df
+    if (!is.null(uncertainty_draws)) {
+      to_return <- to_return  %>%
+        dplyr::bind_rows(sim_curves_df)
+    }
+    to_return <- to_return %>%
+      dplyr::mutate(ci.min = estimate - stats::qnorm((1 + ci) / 2) * se,
+                    ci.max = estimate + stats::qnorm((1 + ci) / 2) * se,
+                    Form = "Smooth",
+                    Summary = "quantiles",
+                    percentile = paste0(100 * q,"th"),
+                    percentile_num = 100 * q)
+
+    return(to_return)
+  })
+  # Convert the result to a data frame
+  quantile_curves_df <- do.call(rbind, quantile_curves_list)
+
+  return(list(curves = quantile_curves_df,
+              mqgam.out = mqgam.out))
+}
diff --git a/R/gen_densities.R b/R/gen_densities.R
@@ -0,0 +1,109 @@
+#' Generate Density Estimates for a \code{quantileplot}
+#'
+#' @description Estimate the conditional density of the outcome at at particular values of the predictor. These conditional densities appear as vertical slices in the output of \code{\link{quantileplot}}. This function is typically called indirectly via a user call to \code{\link{quantileplot}}.
+#'
+#' @param data Data frame containing columns \code{x}, \code{y}, and \code{weight}. For a simple random sample, set \code{weight} to be constant.
+#' @param slice_n Integer number of conditional densities to be plotted for \code{y} given \code{x}. These appear in the visualization as vertical slices. Default is 7.
+#' @param x_range Numeric vector of length 2 containing the range of x-axis to be plotted. Defaults to the range of the predictor variable in \code{data}. You may want to specify a narrower range if the predictor is extremely skewed.
+#' @param y_range Numeric vector of length 2 containing the range of y-axis to be plotted. Defaults to the range of the outcome variable in \code{data}. You may want to specify a narrower range if the outcome is extremely skewed.
+#' @param x_bw Numeric bandwidth for density estimation in the \code{x} dimension. The standard deviation of a Gaussian kernel. If \code{NULL}, this is set by the defaults in \code{stats::density()}.
+#' @param y_bw Numeric bandwidth for density estimation in the \code{y} dimension. The standard deviation of a Gaussian kernel. If \code{NULL}, this is set by the defaults in \code{stats::density()}.
+#' @param granularity Integer number of points at which to evaluate each density. Defaults to 512, as in \code{stats::density()}. Higher values yield more granular density estimates.
+#'
+#' @return List containing \code{marginal} and \code{conditional}, each of which is a data frame with density estimates.
+#'
+#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
+#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
+#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
+#'
+#' @export
+#' @import dplyr
+#' @importFrom stats density dnorm weighted.mean
+gen_densities <- function(data, slice_n = 7, x_range = NULL, y_range = NULL, x_bw = NULL, y_bw = NULL, granularity = 512) {
+  if (!all(c("x","y","weight") %in% colnames(data))) {
+    stop("The data object passed to gen_densities must have these columns: x, y, weight")
+  }
+  if (any(is.na(data$x))) {
+    stop("In the data object passed to gen_densities, x has missing values")
+  }
+  if (any(is.na(data$y))) {
+    stop("In the data object passed to gen_densities, y has missing values")
+  }
+  if (any(is.na(data$weight))) {
+    stop("In the data object passed to gen_densities, weight has missing values")
+  }
+  # Normalize the weight as requested by stats::density()
+  data$weight <- data$weight / sum(data$weight)
+  # If bandwidths were not specified by the user, set them by the default from stats::density()
+  if (is.null(x_bw) | is.null(y_bw)) {
+    cat("Bandwidths are being set at:\n")
+  }
+  if (is.null(x_bw)) {
+    x_bw <- stats::density(data$x, weights = data$weight, kernel = "gaussian")$bw
+    cat(paste("x_bw =",x_bw,"\n"))
+  }
+  if (is.null(y_bw)) {
+    y_bw <- stats::density(data$y, weights = data$weight, kernel = "gaussian")$bw
+    cat(paste("y_bw =",y_bw,"\n"))
+  }
+  # If x_range or y_range is null, replace with the range of the data
+  if (is.null(x_range)) {
+    x_range <- range(data$x)
+  }
+  if (is.null(y_range)) {
+    y_range = range(data$y)
+  }
+
+  # Define the grid of values at which to estimate densities.
+  # Spread the grid evenly over the range, excluding the endpoints.
+  marginal_x_seq <- seq(x_range[1], x_range[2], length.out = granularity + 2)[2:(granularity + 1)]
+  conditional_x_seq <- seq(x_range[1], x_range[2], length.out = slice_n + 2)[2:(slice_n + 1)]
+  conditional_y_seq <- seq(y_range[1], y_range[2], length.out = granularity + 2)[2:(granularity + 1)]
+
+  # Estimate the marginal density
+  marginal_density <- data.frame(x = rep(NA, granularity),
+                                 density = rep(NA, granularity))
+  for (i in 1:granularity) {
+    x_eval <- marginal_x_seq[i]
+    contribution_each_observation <- 1 / x_bw * stats::dnorm((x_eval - data$x) / x_bw)
+    estimate <- weighted.mean(contribution_each_observation,
+                              w = data$weight)
+    # Place those in the marignal_density data frame
+    marginal_density$x[i] <- x_eval
+    marginal_density$density[i] <- estimate
+  }
+
+  # Estimate the conditional densities
+  conditional_density <- data.frame(x = rep(conditional_x_seq, each = granularity),
+                                    y = rep(conditional_y_seq, slice_n),
+                                    density = NA)
+  for (i in 1:nrow(conditional_density)) {
+    # For easier reference, extract this (x,y) for evaluation
+    x_eval <- conditional_density$x[i]
+    y_eval <- conditional_density$y[i]
+
+    # Estimate contribution of each data point to the marginal density at (x_eval)
+    m_each <- 1 / x_bw * stats::dnorm((x_eval - data$x) / x_bw)
+    # Average those to produce the marginal density estimate at (x_eval)
+    # Do the weighted mean manually to reduce risk of underflow problems in large samples.
+    #m <- stats::weighted.mean(m_each, w = data$weight)
+    m <- sum(m_each * data$weight) / sum(data$weight)
+
+    # Estimate contribution of each data point to the conditional density at (x_eval,y_eval)
+    # using the product kernel.
+    # Divide by m internal to the weighted mean rather than external so that the internal
+    # product is not so small.
+    # This is similar to Hyndman et al. 1996 eq. 2.4-2.5.
+    g_each <- (m_each / m) * 1 / y_bw * stats::dnorm((y_eval - data$y) / y_bw)
+    # Average those to produce the marginal density estimate at (x_eval, y_eval)
+    g <- sum(g_each * data$weight) / sum(data$weight)
+
+    # Place that in the data frame
+    conditional_density$density[i] <- g
+  }
+  return(list(marginal = marginal_density,
+              conditional = conditional_density,
+              x_bw = x_bw,
+              y_bw = y_bw))
+}
+
diff --git a/R/plot.quantileplot.R b/R/plot.quantileplot.R
@@ -0,0 +1,38 @@
+#' Plot function for \code{quantileplot} objects
+#' @description Plots a \code{quantileplot} object.
+#' @param x An object of class \code{quantileplot}, which results from a call to the function \code{quantileplot}
+#' @param bottom_xlab_angle Numeric degrees to rotate x-axis labels
+#' @param bottom_height Proportion of vertical space in the full plot to be taken up by uncertainty draws at the bottom (only relevant if the \code{quantileplot} includes uncertainty draws).
+#' @param ... Other arguments to \code{plot} commands
+#' @return Prints a plot. If the \code{quantileplot} was specified with option \code{uncertainty_draws = TRUE}, this is a \code{gtable} object with a panel of plots. If the \code{quantileplot} was specified with \code{uncertainty_draws = FALSE} (the default), this is a \code{ggplot2} object.
+#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
+#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
+#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
+#' @export
+
+plot.quantileplot <- function(x, bottom_xlab_angle = NULL, bottom_height = .3, ...) {
+  if (is.null(x$arguments$uncertainty_draws)) {
+    print(x$plot)
+  } else {
+    if (!is.numeric(bottom_height)) {
+      stop("Argument error: Bottom height should be between 0 and 1 (a proportion of total plot height)")
+    }
+    if (bottom_height < 0 | bottom_height > 1) {
+      stop("Argument error: Bottom height should be between 0 and 1 (a proportion of total plot height)")
+    }
+    all_plots <- x$sim_curve_plots
+    if (!is.null(bottom_xlab_angle)) {
+      for (i in 1:length(all_plots)) {
+        all_plots[[i]] <- all_plots[[i]] +
+          theme(axis.text.x = element_text(angle = bottom_xlab_angle, hjust = 1))
+      }
+    }
+    all_plots$point <- x$plot
+    with_uncertainty_draws <- gridExtra::grid.arrange(grobs = all_plots,
+                                                      heights = c(1 - bottom_height, bottom_height),
+                                                      layout_matrix = rbind(length(x$sim_curve_plots) + 1,
+                                                                            1:length(x$sim_curve_plots)))
+
+    print(with_uncertainty_draws)
+  }
+}
diff --git a/R/print.quantileplot.R b/R/print.quantileplot.R
@@ -0,0 +1,22 @@
+#' Print function for \code{quantileplot} objects
+#' @description Prints the plot generated by \code{quantileplot}
+#' @param x An object of class \code{quantileplot}, which results from a call to the function \code{quantileplot}
+#' @param ... Other arguments to \code{print} commands
+#' @return Prints the plot generated by \code{quantileplot}
+#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
+#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
+#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
+#' @export
+
+print.quantileplot <- function(x, ...) {
+
+  # If there is a lack of convergence, warn the user.
+  convergence <- sapply(x$mqgam.out$fit, function(each_fit) each_fit$converged)
+  if (any(!convergence)) {
+    print("Warning: Convergence not achieved for the following quantile curves.")
+    print(names(convergence[!convergence]))
+  }
+
+  # Print the plot
+  plot(x)
+}