-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfill_missing_values.Rd
86 lines (69 loc) · 2.86 KB
/
fill_missing_values.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fill_missing_values.R
\name{fill_missing_values}
\alias{fill_missing_values}
\title{Fill missing values in a data frame}
\usage{
fill_missing_values(
df,
selected_variables = NULL,
method = c("mean", "min", "max", "median", "harmonic", "geometric")
)
}
\arguments{
\item{df}{A data frame to process for missing value imputation.}
\item{selected_variables}{An optional character vector of variable names within \code{df}
for which missing values should be imputed. If \code{NULL} (default), imputation
is applied to all variables in the data frame. Variable names must be quoted.}
\item{method}{A character string specifying the imputation method for
continuous variables. Supported methods are \code{"min"}, \code{"max"}, \code{"mean"},
\code{"median"}, \code{"harmonic"}, and \code{"geometric"}. The default method is \code{"mean"}.
For categorical variables, the \code{mode} is always used.}
}
\value{
A data frame with missing values imputed according to the specified
\code{method}.
}
\description{
\code{fill_missing_values()} is an efficient function that addresses missing
values in a data frame. It uses imputation by function, also known as
column-based imputation, to impute the missing values. For continuous
variables, it supports various methods of imputation, including minimum,
maximum, mean, median, harmonic mean, and geometric mean. For categorical
variables, missing values are replaced with the mode of the column. This
approach ensures accurate and consistent replacements derived from individual
columns, resulting in a complete and reliable dataset for improved analysis
and decision-making.
}
\examples{
library(dplyr)

# Build a small data frame with missing values in both numeric and
# character columns
df <- tibble::tibble(
  Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5),
  Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7),
  Petal_Width = c(NA, 0.2, 1.2, 0.2, 1.3, 1.8, NA),
  Species = c("setosa", NA, "versicolor", "setosa",
              NA, "virginica", "setosa")
)

# Impute using the mean method for continuous variables
result_df_mean <- fill_missing_values(df, method = "mean")
result_df_mean

# Impute using the geometric mean for continuous variables and restrict
# the imputation to the variables `Petal_Length` and `Petal_Width`
result_df_geomean <- fill_missing_values(
  df,
  selected_variables = c("Petal_Length", "Petal_Width"),
  method = "geometric"
)
result_df_geomean

# Impute missing values (NAs) in a grouped data frame: split the data by
# the grouping variable, impute each piece separately, then bind the
# pieces back together
sample_iris <- tibble::tibble(
  Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5),
  Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7),
  Petal_Width = c(0.3, 0.2, 1.2, 0.2, 1.3, 1.8, NA),
  Species = c("setosa", "setosa", "versicolor", "setosa",
              "virginica", "virginica", "setosa")
)

# lapply() + bind_rows() needs only dplyr, which is attached above
sample_iris \%>\%
  group_by(Species) \%>\%
  group_split() \%>\%
  lapply(fill_missing_values, method = "median") \%>\%
  bind_rows()
}