-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathstat_dens2d_labels.Rd
271 lines (228 loc) · 10.4 KB
/
stat_dens2d_labels.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stat-dens2d-labels.r
\name{stat_dens2d_labels}
\alias{stat_dens2d_labels}
\title{Replace labels in data based on 2D density}
\usage{
stat_dens2d_labels(
mapping = NULL,
data = NULL,
geom = "text",
position = "identity",
...,
keep.fraction = 0.1,
keep.number = Inf,
keep.sparse = TRUE,
keep.these = FALSE,
exclude.these = FALSE,
these.target = "label",
pool.along = c("xy", "x", "y", "none"),
xintercept = 0,
yintercept = 0,
invert.selection = FALSE,
h = NULL,
n = NULL,
label.fill = "",
return.density = FALSE,
na.rm = TRUE,
show.legend = FALSE,
inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{The aesthetic mapping, usually constructed with
\code{\link[ggplot2]{aes}} or \code{\link[ggplot2]{aes_}}. Only needs
to be set at the layer level if you are overriding the plot defaults.}
\item{data}{A layer specific dataset - only needed if you want to override
the plot defaults.}
\item{geom}{The geometric object to use to display the data.}
\item{position}{The position adjustment to use for overlapping points on this
layer}
\item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. This
can include aesthetics whose values you want to set, not map. See
\code{\link[ggplot2]{layer}} for more details.}
\item{keep.fraction}{numeric [0..1]. The fraction of the observations (or
rows) in \code{data} to be retained.}
\item{keep.number}{integer Set the maximum number of observations to retain,
effective only if obeying \code{keep.fraction} would result in a larger
number.}
\item{keep.sparse}{logical If \code{TRUE}, the default, observations from the
more sparse regions are retained, if \code{FALSE} those from the densest
regions.}
\item{keep.these, exclude.these}{character vector, integer vector, logical
vector or function that takes one or more variables in data selected by
\code{these.target}. Negative integers behave as in R's extraction methods.
The rows from \code{data} indicated by \code{keep.these} and
\code{exclude.these} are kept or excluded irrespective of the local
density.}
\item{these.target}{character, numeric or logical selecting one or more
column(s) of \code{data}. If \code{TRUE} the whole \code{data} object is
passed.}
\item{pool.along}{character, one of \code{"xy"}, \code{"x"}, \code{"y"} or
\code{"none"}, indicating if selection should be done pooling the
observations along the \emph{x} and/or \emph{y} aesthetics, or separately
on either side of \code{xintercept} and/or \code{yintercept}.}
\item{xintercept, yintercept}{numeric The split points for the data filtering.}
\item{invert.selection}{logical If \code{TRUE}, the complement of the
selected rows are returned.}
\item{h}{vector of bandwidths for x and y directions. Defaults to normal
reference bandwidth (see bandwidth.nrd). A scalar value will be taken to
apply to both directions.}
\item{n}{Number of grid points in each direction. Can be scalar or a length-2
integer vector}
\item{label.fill}{character vector of length 1, a function or \code{NULL}.}
\item{return.density}{logical vector of length 1. If \code{TRUE} add columns
\code{"density"} and \code{"keep.obs"} to the returned data frame.}
\item{na.rm}{a logical value indicating whether NA values should be stripped
before the computation proceeds.}
\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA} includes if any aesthetics are mapped. \code{FALSE}, the default,
never includes, and \code{TRUE} always includes.}
\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather
than combining with them. This is most useful for helper functions that
define both data and aesthetics and shouldn't inherit behaviour from the
default plot specification, e.g. \code{\link[ggplot2]{borders}}.}
}
\value{
A plot layer instance. Using as output \code{data} the input
\code{data} after value substitution based on a 2D density filtering
criterion.
}
\description{
\code{stat_dens2d_labels()} Sets values mapped to the
\code{label} aesthetic to \code{""} or a user provided character string
based on the local density in regions of a plot panel. Its main use is
together with repulsive geoms from package \code{\link[ggrepel]{ggrepel}}.
If there is no mapping to \code{label} in \code{data}, the mapping is set
to \code{rownames(data)}, with a message.
}
\details{
\code{stat_dens2d_labels()} is designed to work together with
geometries from package 'ggrepel'. To avoid text labels being plotted over
unlabelled points all the rows in data need to be retained but
labels replaced with the empty character string, \code{""}. Function
\code{\link{stat_dens2d_filter}} cannot be used with the repulsive geoms
from 'ggrepel' because it drops observations.
\code{stat_dens2d_labels()} can be useful also in other situations, as the
substitution character string can be set by the user by passing an argument
to \code{label.fill}. If this argument is \code{NULL} the unselected rows
are filtered out identically as by \code{stat_dens2d_filter}.
The local density of observations in 2D (\emph{x} and \emph{y}) is computed
with function \code{\link[MASS]{kde2d}} and used to select observations,
passing to the geom all the rows in its \code{data} input but with the
text of labels replaced in those "not kept". The default is to select
observations in sparse regions of the plot, but the selection can be
inverted so that only observations in the densest regions are returned.
Specific observations can be protected from having the label replaced by
passing a suitable argument to \code{keep.these}. Logical and integer
vectors function as indexes to rows in \code{data}, while a character
vector is compared to values in the variable mapped to the \code{label}
aesthetic. A function passed as argument to \code{keep.these} will receive
as its first argument the values in the variable mapped to \code{label} and
should return a character, logical or numeric vector as described above.
How many labels are retained intact in addition to those in
\code{keep.these} is controlled with arguments passed to \code{keep.number}
and \code{keep.fraction}. \code{keep.number} sets the maximum number of
observations selected; whenever \code{keep.fraction} results in fewer
observations selected, it is obeyed.
Computation of density and of the default bandwidth require at least
two observations with different values. If data do not fulfill this
condition, they are kept only if \code{keep.fraction = 1}. This is correct
behavior for a single observation, but can be surprising in the case of
multiple observations.
Parameters \code{keep.these} and \code{exclude.these} make it possible to
force inclusion or exclusion of observations after the density is computed.
In case of conflict, \code{exclude.these} overrides \code{keep.these}.
}
\note{
Which points are kept and which not depends on how dense a grid is used
and how flexible the density surface estimate is. This depends on the
values passed as arguments to parameters \code{n} and \code{h}. It is also
important to be aware that both
\code{geom_text()} and \code{geom_text_repel()} can avoid overplotting by
discarding labels at the plot rendering stage, i.e., what is plotted may
differ from what is returned by this statistic.
}
\examples{
random_string <-
function(len = 6) {
paste(sample(letters, len, replace = TRUE), collapse = "")
}
# Make random data.
set.seed(1001)
d <- tibble::tibble(
x = rnorm(100),
y = rnorm(100),
group = rep(c("A", "B"), c(50, 50)),
lab = replicate(100, { random_string() })
)
# using defaults
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels()
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(keep.these = "zoujdg")
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(keep.these = function(x) {grepl("^z", x)})
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(geom = "text_s",
position = position_nudge_center(x = 0.1, y = 0.1,
center_x = mean,
center_y = mean),
vjust = "outward_mean", hjust = "outward_mean") +
expand_limits(x = c(-4, 4.5))
ggrepel.installed <- requireNamespace("ggrepel", quietly = TRUE)
if (ggrepel.installed) {
library(ggrepel)
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
geom_point() +
stat_dens2d_labels(geom = "text_repel")
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
geom_point() +
stat_dens2d_labels(geom = "text_repel", label.fill = NA)
# we keep labels starting with "a" across the whole plot, but all in sparse
# regions. To achieve this we pass as argument to label.fill a function
# instead of a character string.
label.fun <- function(x) {ifelse(grepl("^a", x), x, "")}
ggplot(data = d, aes(x, y, label = lab, colour = group)) +
geom_point() +
stat_dens2d_labels(geom = "text_repel", label.fill = label.fun)
}
# Using geom_debug() we can see that all 100 rows in d are
# returned. But only those labelled in the previous example still contain
# the original labels.
gginnards.installed <- requireNamespace("gginnards", quietly = TRUE)
if (gginnards.installed) {
library(gginnards)
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(geom = "debug")
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(geom = "debug", return.density = TRUE)
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(geom = "debug", label.fill = NULL)
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(geom = "debug", label.fill = FALSE, return.density = TRUE)
ggplot(data = d, aes(x, y, label = lab)) +
geom_point() +
stat_dens2d_labels(geom = "debug", label.fill = NULL, return.density = TRUE)
ggplot(data = d, aes(x, y)) +
geom_point() +
stat_dens2d_labels(geom = "debug")
}
}
\seealso{
\code{\link{stat_dens2d_filter}} and \code{\link[MASS]{kde2d}} used
internally. Parameters \code{n}, \code{h} in this statistic correspond to
the parameters with the same name in this imported function. Limits are set
to the limits of the plot scales.
Other statistics returning a subset of data:
\code{\link{stat_dens1d_filter}()},
\code{\link{stat_dens1d_labels}()},
\code{\link{stat_dens2d_filter}()}
}
\concept{statistics returning a subset of data}