diff --git a/DESCRIPTION b/DESCRIPTION index 72db09727..8fa08278d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,6 @@ Package: aqp -Version: 1.9.5 -Date: 2015-12-31 -Packaged: 2015-12-31; dylan +Version: 1.9.6 +Date: 2016-02-23 Title: Algorithms for Quantitative Pedology Author: Dylan Beaudette , Pierre Roudier Maintainer: Dylan Beaudette diff --git a/NEWS b/NEWS index 95248f7d0..d299653e1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +-------------------------- aqp 1.9.6 (2016-02-23) -------------------------- +* minor changes to profile_compare(), getting ready for a complete overhaul to this function +* new function evalMissingData(), see manual page + -------------------------- aqp 1.9.5 (2015-12-28) -------------------------- * documentation for new transition probability functions, see ?hzTransitionProbabilities for details diff --git a/R/evalMissingData.R b/R/evalMissingData.R new file mode 100644 index 000000000..dbe8d53c5 --- /dev/null +++ b/R/evalMissingData.R @@ -0,0 +1,40 @@ + +## TODO: this needs checking / documentation + +## missing data fraction computed on a thickness basis +## this is meant to be run on a single profile at a time +# x: single SPC +# v: variables to consider +# n: horizon designations +# p: inverted pattern matching non-soil horizons +.getMissingDataFraction <- function(x, v, n, p) { + # get horizons + h <- horizons(x) + # get top/bottom + dc <- horizonDepths(x) + # data fraction based on thickness + hz.thick <- h[[dc[2]]] - h[[dc[1]]] + + # get all "soil" horizons, for named variables + soil.hz <- grep(p, h[[n]], ignore.case=TRUE, invert=TRUE) + d <- h[soil.hz, v] + hz.thick <- hz.thick[soil.hz] + hz.with.data <- which(complete.cases(d)) + # compute fraction of data based on thickness + res <- sum(hz.thick[hz.with.data], na.rm=TRUE) / sum(hz.thick, na.rm = TRUE) + + return(res) +} + + +## iterate over profiles +# x: SPC with >=1 profiles +# vars: variables to consider +# name: horizon designations +# p: inverted pattern matching non-soil horizons +evalMissingData <- function(x, vars, name='hzname', p='Cr|R|Cd') { + # apply missing data fraction eval to each profile + md <- profileApply(x, .getMissingDataFraction, v=vars, n=name, p=p) + return(md) +} + diff --git a/R/profile_compare.R b/R/profile_compare.R index 52d458b57..2a3a36229 100644 --- a/R/profile_compare.R +++ b/R/profile_compare.R @@ -24,13 +24,15 @@ ## TODO: determine practical significance of 'filter' argument + ## low-level function that the user will probably not ever use directly # Seems to scale to 1000 profiles with 5 variables, could use optimization # function requires at least two attributes # hard coded reference to s$id # set k to 0 for no depth weighting -pc <- function(s, vars, max_d, k, filter=NULL, sample_interval=NA, replace_na=TRUE, add_soil_flag=TRUE, return_depth_distances=FALSE, strict_hz_eval=FALSE, progress='none', plot.depth.matrix=FALSE, -rescale.result=FALSE, verbose=FALSE) { +pc <- function(s, vars, max_d, k, filter=NULL, sample_interval=NA, replace_na=TRUE, +add_soil_flag=TRUE, return_depth_distances=FALSE, strict_hz_eval=FALSE, progress='none', +plot.depth.matrix=FALSE, rescale.result=FALSE, verbose=FALSE) { # currently this will only work with integer depths # test by attempting to cast to integers @@ -51,33 +53,6 @@ rescale.result=FALSE, verbose=FALSE) { s <- s[filter, ] } - ## TODO: put this into its own function - ## TODO: weight result using horizon thickness - # iterate over profiles and compute percent missing data by variable - pct_missing <- ddply(s, 'id', .fun=function(i, v=vars) { - ## TODO: rows.to.review may not be needed - # only evaluate missing data within horizons that aren't completly NA (Oi, Cr, R horizons) - # all.missing.test <- apply(sapply(i[, v], is.na), 1, all) - # rows.to.review <- which( ! all.missing.test ) - round(sapply(i[, v], function(j) length(which(is.na(j)))) / nrow(i) * 100) - }) - - # keep track of profiles missing any or all of their data - problem.profiles.idx <- which(apply(pct_missing[, vars], 1, function(i) any(i > 0)) ) - bad.profiles.idx <- which(apply(pct_missing[, vars], 1, function(i) all(i == 100)) ) - - if(length(problem.profiles.idx) > 0) { - assign('problem.profiles', value=pct_missing[problem.profiles.idx,], envir=aqp.env) - message('Missing data will bias results, check inputs.\nPercent missing:') - print(pct_missing[problem.profiles.idx, ]) - } - - if(length(bad.profiles.idx) > 0) { - # stop stop and let the user know - bad.profiles <- unique(s$id)[bad.profiles.idx] - stop(paste('no non-NA values associated with profiles:', paste(bad.profiles, collapse=', '), '\nConsider removing these profiles and re-running.'), call.=FALSE) - } - # identify the number of profiles # n.profiles <- length(s) n.profiles <- length(unique(s$id)) @@ -316,6 +291,29 @@ rescale.result=FALSE, verbose=FALSE) { pc.SPC <- function(s, vars, rescale.result=FALSE, ...){ # default behavior: do not normalize D + ## 2016-02-22: check for missing data moved from pc() to here + ## TODO: this makes an assumption on the column containing horizon designations + # iterate over profiles and compute percent missing data by variable + pct_data <- evalMissingData(s, vars) + + ## TODO: review this, and make optional via argument + # keep track of profiles missing any or all of their data + problem.profiles.idx <- which(pct_data < 1) + # bad.profiles.idx <- which(pct_data == 0) + + if(length(problem.profiles.idx) > 0) { + # assign('problem.profiles', value=pct_missing[problem.profiles.idx,], envir=aqp.env) + message('Missing data will bias results, check inputs.') + } + + + ## 2016-02-22: disabled for now +# if(length(bad.profiles.idx) > 0) { +# # stop stop and let the user know +# bad.profiles <- profile_id(s)[bad.profiles.idx] +# stop(paste('no non-NA values associated with profiles:', paste(bad.profiles, collapse=', '), '\nConsider removing these profiles and re-running.'), call.=FALSE) +# } + # extract horizons s.hz <- horizons(s) diff --git a/man/evalMissingData.Rd b/man/evalMissingData.Rd new file mode 100644 index 000000000..f17a9f866 --- /dev/null +++ b/man/evalMissingData.Rd @@ -0,0 +1,41 @@ +\name{evalMissingData} +\alias{evalMissingData} + +\title{Evaluate Missing Data} +\description{Evaluate missing data in a SoilProfileCollection object} +\usage{ +evalMissingData(x, vars, name = "hzname", p = "Cr|R|Cd") +} + +\arguments{ + \item{x}{a \code{SoilProfileCollection} object} + \item{vars}{a chatacter vector naming horizon-level attributes in \code{x}} + \item{name}{the name of a horizon-level attribute where horizon designations are stored} + \item{p}{REGEX pattern used to match non-soil horizons} +} + +\details{Data completeness is evaluated by profile, based on the thickness of horizons with complete horizon-level attribute values (specified in \code{vars}) divided by the total thickness. The default REGEX pattern, \code{p}, should catch most non-soil horizons which are excluded from the evaluation.} + +\value{A vector values ranging from 0 to 1, representing the percentage of non-NA data (as specified in \code{vars}) for each profile.} + +\author{D.E. Beaudette} + + +\examples{ +# example data +data(sp2) +# init SPC object +depths(sp2) <- id ~ top + bottom +# compute data completeness +sp2$data.complete <- evalMissingData(sp2, vars = c('r', 'g', 'b'), name = 'name') +# rank +new.order <- order(sp2$data.complete) +# plot along data completeness ranking +plot(sp2, plot.order=new.order) +# add axis, note re-ordering of axis labels +axis(side=1, at=1:length(sp2), labels = round(sp2$data.complete[new.order], 2), +line=-2, cex.axis=0.75) +} + +\keyword{manip} +