Skip to content

Commit

Permalink
Initialize quantileplot package
Browse files Browse the repository at this point in the history
  • Loading branch information
Ian Lundberg committed May 3, 2021
0 parents commit 6768e0d
Show file tree
Hide file tree
Showing 24 changed files with 2,036 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
inst/doc
Meta
.Rproj.user
.Rhistory
.RData
.Ruserdata
.DS_Store
29 changes: 29 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Package: quantileplot
Version: 0.1.1
Date: 2021-02-25
Title: Smooth Quantile Visualization Tool
Description: This package creates a bivariate smooth quantile plot. The plot visualizes estimates of the marginal density of X, the conditional density of Y given X at selected values of X, and smooth curves showing trends in quantiles of Y given X as a function of X.
Author: Robin C. Lee, Ian Lundberg, Brandon M. Stewart
Maintainer: Robin C. Lee <[email protected]>
RoxygenNote: 7.1.1
Imports:
dplyr,
foreach,
ggplot2,
rlang,
ggthemes,
qgam,
MASS,
labeling,
stats,
gridExtra,
mgcv,
mvtnorm,
reshape2,
tidyr
License: MIT + file LICENSE
Encoding: UTF-8
Suggests:
rmarkdown,
knitr
VignetteBuilder: knitr
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2021
COPYRIGHT HOLDER: Robin C. Lee
23 changes: 23 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,quantileplot)
S3method(print,quantileplot)
S3method(summary,quantileplot)
export(gen_curves)
export(gen_densities)
export(quantileplot)
import(dplyr)
import(ggplot2)
import(rlang)
importFrom(ggthemes,scale_color_colorblind)
importFrom(gridExtra,grid.arrange)
importFrom(labeling,extended)
importFrom(mgcv,PredictMat)
importFrom(mvtnorm,rmvnorm)
importFrom(qgam,mqgam)
importFrom(reshape2,melt)
importFrom(stats,density)
importFrom(stats,dnorm)
importFrom(stats,predict)
importFrom(stats,qnorm)
importFrom(stats,weighted.mean)
134 changes: 134 additions & 0 deletions R/gen_curves.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#' Generate Smooth Curves for a \code{quantileplot}
#'
#' @description Estimate smooth curves for quantiles of the outcome as a function of the predictor. This function calls \code{mqgam} from the \code{qgam} package. This function is typically called indirectly via a user call to \code{\link{quantileplot}}.
#' @param formula A bivariate model formula (e.g. \code{y ~ x})
#' @param second_formula Model formula to allow the learning rate to change as a function of the predictor. This is passed to \code{mqgam} as the second element in the \code{form} argument. Defaults to the same specification as \code{formula} but without the outcome variable.
#' @param data Data frame containing the variables in \code{formula}. If \code{weights} are specified, they must be a column of \code{data}.
#' @param weights String name for sampling weights, which are a column of \code{data}. If not given, a simple random sample is assumed.
#' @param quantiles Numeric vector containing quantiles to be estimated. Values should be between 0 and 1.
#' @param show_ci Logical, defaults to \code{FALSE}. Whether to show credible intervals for the estimated smooth quantile curves.
#' @param ci Numeric probability value for credible intervals; default to 0.95 to produce 95 percent credible intervals. Only relevant if \code{show_ci = TRUE}.
#' @param uncertainty_draws Numeric. If non-null, the number of simulated posterior draws to estimate for each smooth quantile curve. When used with the \code{plot} function, these appear in panels below the main plot.
#' @param ... Other arguments passed to \code{mqgam} for fitting of smooth quantile curves.
#' @return A list of length 2. Element \code{curves} is a data frame containing the data for plotting smooth curves for quantiles of the outcome given the predictor. Element \code{mqgam.out} is the fitted object from \code{mqgam}.
#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
#'
#' @export
#' @import dplyr
#' @import ggplot2
#' @import rlang
#' @importFrom reshape2 melt
#' @importFrom stats predict qnorm
#' @importFrom qgam mqgam
#' @importFrom mgcv PredictMat
#' @importFrom mvtnorm rmvnorm
gen_curves <- function(formula,
second_formula,
data,
weights = NULL,
quantiles = c(.1, .25, .5, .75, .9),
show_ci = FALSE,
ci = 0.95,
uncertainty_draws = 10,
...) {
# Initialize objects that will be called by non-standard evaluation.
i <- curve <- x <- estimate <- se <- NULL

# Extract the x variable name from the formula
x_str <- all.vars(formula)[2]

# See if any smooth terms used in the formula
contains_smooth_terms <- any(grepl("s[(]",as.character(formula)))

# Estimate the smooth quantile curves
if (is.null(weights) & contains_smooth_terms) {
mqgam.out <- qgam::mqgam(list(formula, second_formula),
data = data, qu = quantiles,
...)
} else if (!is.null(weights) & contains_smooth_terms) {
mqgam.out <- qgam::mqgam(list(formula, second_formula),
data = data, qu = quantiles,
argGam = list(weights = data[[weights]]),
...)
} else if (is.null(weights) & !contains_smooth_terms) {
mqgam.out <- qgam::mqgam(formula,
data = data, qu = quantiles,
...)
} else if (!is.null(weights) & !contains_smooth_terms) {
mqgam.out <- qgam::mqgam(formula,
data = data, qu = quantiles,
argGam = list(weights = data[[weights]]),
...)
}

# Split the x range into 200 points where we will make predictions
to_predict <- data.frame(x = seq(min(data %>% group_by() %>% dplyr::select_at(x_str)),
max(data %>% group_by() %>% dplyr::select_at(x_str)),
length.out = 200)) %>%
rename_with(.fn = function(x) x_str)

# Generate point estimate and simulated curves
quantile_curves_list <- lapply(quantiles, function(q) {
quantile_index <- which(quantiles == q)
# Get the model matrix on the expanded basis
if (!contains_smooth_terms) {
X_basis_expansion <- mgcv::predict.gam(mqgam.out$fit[[1]],
type = "lpmatrix",
newdata = to_predict)
} else {
# For smooth terms, need a different function to predict the model matrix
# cbind adds the intercept to the smooth
X_basis_expansion <- cbind(1,mgcv::PredictMat(mqgam.out$smooth[[1]],
data = to_predict))
}
# Extract the fitted coefficients beta
beta <- mqgam.out$fit[[quantile_index]]$coefficients
# Extract the variance-covariance matrix Sigma
rV <- mqgam.out$fit[[quantile_index]]$rV
sig2 <- mqgam.out$fit[[quantile_index]]$sig2
Sigma <- rV %*% t(rV) * sig2

# Calculate the point estimate and standard error
yhat_point <- X_basis_expansion %*% beta
yhat_se <- sqrt(diag(X_basis_expansion %*% Sigma %*% t(X_basis_expansion)))
# Create a data frame of the point estimate
point_df <- data.frame(curve = "point",
x = to_predict[[1]],
estimate = yhat_point,
se = yhat_se)

# Calculate uncertainty_draws simulated curves
if (!is.null(uncertainty_draws)) {
beta_star <- t(mvtnorm::rmvnorm(uncertainty_draws, mean = beta, sigma = Sigma))
yhat_star <- X_basis_expansion %*% beta_star
# Create a data frame of the simulated curves
sim_curves_df <- data.frame(yhat_star) %>%
dplyr::rename_all(.funs = function(x) gsub("X","sim",x)) %>%
dplyr::mutate(x = to_predict[[1]]) %>%
reshape2::melt(id = "x", variable.name = "curve", value.name = "estimate") %>%
dplyr::mutate(se = NA) %>%
dplyr::select(curve, x, estimate, se)
}
to_return <- point_df
if (!is.null(uncertainty_draws)) {
to_return <- to_return %>%
dplyr::bind_rows(sim_curves_df)
}
to_return <- to_return %>%
dplyr::mutate(ci.min = estimate - stats::qnorm((1 + ci) / 2) * se,
ci.max = estimate + stats::qnorm((1 + ci) / 2) * se,
Form = "Smooth",
Summary = "quantiles",
percentile = paste0(100 * q,"th"),
percentile_num = 100 * q)

return(to_return)
})
# Convert the result to a data frame
quantile_curves_df <- do.call(rbind, quantile_curves_list)

return(list(curves = quantile_curves_df,
mqgam.out = mqgam.out))
}
109 changes: 109 additions & 0 deletions R/gen_densities.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#' Generate Density Estimates for a \code{quantileplot}
#'
#' @description Estimate the conditional density of the outcome at at particular values of the predictor. These conditional densities appear as vertical slices in the output of \code{\link{quantileplot}}. This function is typically called indirectly via a user call to \code{\link{quantileplot}}.
#'
#' @param data Data frame containing columns \code{x}, \code{y}, and \code{weight}. For a simple random sample, set \code{weight} to be constant.
#' @param slice_n Integer number of conditional densities to be plotted for \code{y} given \code{x}. These appear in the visualization as vertical slices. Default is 7.
#' @param x_range Numeric vector of length 2 containing the range of x-axis to be plotted. Defaults to the range of the predictor variable in \code{data}. You may want to specify a narrower range if the predictor is extremely skewed.
#' @param y_range Numeric vector of length 2 containing the range of y-axis to be plotted. Defaults to the range of the outcome variable in \code{data}. You may want to specify a narrower range if the outcome is extremely skewed.
#' @param x_bw Numeric bandwidth for density estimation in the \code{x} dimension. The standard deviation of a Gaussian kernel. If \code{NULL}, this is set by the defaults in \code{stats::density()}.
#' @param y_bw Numeric bandwidth for density estimation in the \code{y} dimension. The standard deviation of a Gaussian kernel. If \code{NULL}, this is set by the defaults in \code{stats::density()}.
#' @param granularity Integer number of points at which to evaluate each density. Defaults to 512, as in \code{stats::density()}. Higher values yield more granular density estimates.
#'
#' @return List containing \code{marginal} and \code{conditional}, each of which is a data frame with density estimates.
#'
#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
#'
#' @export
#' @import dplyr
#' @importFrom stats density dnorm weighted.mean
gen_densities <- function(data, slice_n = 7, x_range = NULL, y_range = NULL, x_bw = NULL, y_bw = NULL, granularity = 512) {
if (!all(c("x","y","weight") %in% colnames(data))) {
stop("The data object passed to gen_densities must have these columns: x, y, weight")
}
if (any(is.na(data$x))) {
stop("In the data object passed to gen_densities, x has missing values")
}
if (any(is.na(data$y))) {
stop("In the data object passed to gen_densities, y has missing values")
}
if (any(is.na(data$weight))) {
stop("In the data object passed to gen_densities, weight has missing values")
}
# Normalize the weight as requested by stats::density()
data$weight <- data$weight / sum(data$weight)
# If bandwidths were not specified by the user, set them by the default from stats::density()
if (is.null(x_bw) | is.null(y_bw)) {
cat("Bandwidths are being set at:\n")
}
if (is.null(x_bw)) {
x_bw <- stats::density(data$x, weights = data$weight, kernel = "gaussian")$bw
cat(paste("x_bw =",x_bw,"\n"))
}
if (is.null(y_bw)) {
y_bw <- stats::density(data$y, weights = data$weight, kernel = "gaussian")$bw
cat(paste("y_bw =",y_bw,"\n"))
}
# If x_range or y_range is null, replace with the range of the data
if (is.null(x_range)) {
x_range <- range(data$x)
}
if (is.null(y_range)) {
y_range = range(data$y)
}

# Define the grid of values at which to estimate densities.
# Spread the grid evenly over the range, excluding the endpoints.
marginal_x_seq <- seq(x_range[1], x_range[2], length.out = granularity + 2)[2:(granularity + 1)]
conditional_x_seq <- seq(x_range[1], x_range[2], length.out = slice_n + 2)[2:(slice_n + 1)]
conditional_y_seq <- seq(y_range[1], y_range[2], length.out = granularity + 2)[2:(granularity + 1)]

# Estimate the marginal density
marginal_density <- data.frame(x = rep(NA, granularity),
density = rep(NA, granularity))
for (i in 1:granularity) {
x_eval <- marginal_x_seq[i]
contribution_each_observation <- 1 / x_bw * stats::dnorm((x_eval - data$x) / x_bw)
estimate <- weighted.mean(contribution_each_observation,
w = data$weight)
# Place those in the marignal_density data frame
marginal_density$x[i] <- x_eval
marginal_density$density[i] <- estimate
}

# Estimate the conditional densities
conditional_density <- data.frame(x = rep(conditional_x_seq, each = granularity),
y = rep(conditional_y_seq, slice_n),
density = NA)
for (i in 1:nrow(conditional_density)) {
# For easier reference, extract this (x,y) for evaluation
x_eval <- conditional_density$x[i]
y_eval <- conditional_density$y[i]

# Estimate contribution of each data point to the marginal density at (x_eval)
m_each <- 1 / x_bw * stats::dnorm((x_eval - data$x) / x_bw)
# Average those to produce the marginal density estimate at (x_eval)
# Do the weighted mean manually to reduce risk of underflow problems in large samples.
#m <- stats::weighted.mean(m_each, w = data$weight)
m <- sum(m_each * data$weight) / sum(data$weight)

# Estimate contribution of each data point to the conditional density at (x_eval,y_eval)
# using the product kernel.
# Divide by m internal to the weighted mean rather than external so that the internal
# product is not so small.
# This is similar to Hyndman et al. 1996 eq. 2.4-2.5.
g_each <- (m_each / m) * 1 / y_bw * stats::dnorm((y_eval - data$y) / y_bw)
# Average those to produce the marginal density estimate at (x_eval, y_eval)
g <- sum(g_each * data$weight) / sum(data$weight)

# Place that in the data frame
conditional_density$density[i] <- g
}
return(list(marginal = marginal_density,
conditional = conditional_density,
x_bw = x_bw,
y_bw = y_bw))
}

38 changes: 38 additions & 0 deletions R/plot.quantileplot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#' Plot function for \code{quantileplot} objects
#' @description Plots a \code{quantileplot} object.
#' @param x An object of class \code{quantileplot}, which results from a call to the function \code{quantileplot}
#' @param bottom_xlab_angle Numeric degrees to rotate x-axis labels
#' @param bottom_height Proportion of vertical space in the full plot to be taken up by uncertainty draws at the bottom (only relevant if the \code{quantileplot} includes uncertainty draws).
#' @param ... Other arguments to \code{plot} commands
#' @return Prints a plot. If the \code{quantileplot} was specified with option \code{uncertainty_draws = TRUE}, this is a \code{gtable} object with a panel of plots. If the \code{quantileplot} was specified with \code{uncertainty_draws = FALSE} (the default), this is a \code{ggplot2} object.
#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
#' @export

plot.quantileplot <- function(x, bottom_xlab_angle = NULL, bottom_height = .3, ...) {
if (is.null(x$arguments$uncertainty_draws)) {
print(x$plot)
} else {
if (!is.numeric(bottom_height)) {
stop("Argument error: Bottom height should be between 0 and 1 (a proportion of total plot height)")
}
if (bottom_height < 0 | bottom_height > 1) {
stop("Argument error: Bottom height should be between 0 and 1 (a proportion of total plot height)")
}
all_plots <- x$sim_curve_plots
if (!is.null(bottom_xlab_angle)) {
for (i in 1:length(all_plots)) {
all_plots[[i]] <- all_plots[[i]] +
theme(axis.text.x = element_text(angle = bottom_xlab_angle, hjust = 1))
}
}
all_plots$point <- x$plot
with_uncertainty_draws <- gridExtra::grid.arrange(grobs = all_plots,
heights = c(1 - bottom_height, bottom_height),
layout_matrix = rbind(length(x$sim_curve_plots) + 1,
1:length(x$sim_curve_plots)))

print(with_uncertainty_draws)
}
}
22 changes: 22 additions & 0 deletions R/print.quantileplot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#' Print function for \code{quantileplot} objects
#' @description Prints the plot generated by \code{quantileplot}
#' @param x An object of class \code{quantileplot}, which results from a call to the function \code{quantileplot}
#' @param ... Other arguments to \code{print} commands
#' @return Prints the plot generated by \code{quantileplot}
#' @references Lee, Robin C., Ian Lundberg, and Brandon M. Stewart. 2021. "Smooth quantile visualizations enhance understanding of bivariate population distributions." Working paper.
#' @references Lundberg, Ian, and Brandon M. Stewart. 2020. "Comment: Summarizing income mobility with multiple smooth quantiles instead of parameterized means." Sociological Methodology 50(1):96-111.
#' @references Fasiolo, Matteo, Simon N. Wood, Margaux Zaffran, Raphaël Nedellec, and Yannig Goude. 2020. "Fast calibrated additive quantile regression." Journal of the American Statistical Association.
#' @export

print.quantileplot <- function(x, ...) {

# If there is a lack of convergence, warn the user.
convergence <- sapply(x$mqgam.out$fit, function(each_fit) each_fit$converged)
if (any(!convergence)) {
print("Warning: Convergence not achieved for the following quantile curves.")
print(names(convergence[!convergence]))
}

# Print the plot
plot(x)
}
Loading

0 comments on commit 6768e0d

Please sign in to comment.