new function, minimally tested changes to profile_compare
dylanbeaudette committed Feb 23, 2016
1 parent c727a90 commit 4fe1261
Showing 5 changed files with 114 additions and 32 deletions.
5 changes: 2 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,6 @@
Package: aqp
-Version: 1.9.5
-Date: 2015-12-31
-Packaged: 2015-12-31; dylan
+Version: 1.9.6
+Date: 2016-02-23
Title: Algorithms for Quantitative Pedology
Author: Dylan Beaudette <[email protected]>, Pierre Roudier <[email protected]>
Maintainer: Dylan Beaudette <[email protected]>
4 changes: 4 additions & 0 deletions NEWS
@@ -1,3 +1,7 @@
+-------------------------- aqp 1.9.6 (2016-02-23) --------------------------
+* minor changes to profile_compare(), getting ready for a complete overhaul to this function
+* new function evalMissingData(), see manual page

-------------------------- aqp 1.9.5 (2015-12-28) --------------------------
* documentation for new transition probability functions, see ?hzTransitionProbabilities for details

40 changes: 40 additions & 0 deletions R/evalMissingData.R
@@ -0,0 +1,40 @@

## TODO: this needs checking / documentation

## missing data fraction computed on a thickness basis
## this is meant to be run on a single profile at a time
# x: single SPC
# v: variables to consider
# n: horizon designations
# p: inverted pattern matching non-soil horizons
.getMissingDataFraction <- function(x, v, n, p) {
# get horizons
h <- horizons(x)
# get top/bottom
dc <- horizonDepths(x)
# horizon thickness, used to weight the data fraction
hz.thick <- h[[dc[2]]] - h[[dc[1]]]

# get all "soil" horizons, for named variables
soil.hz <- grep(p, h[[n]], ignore.case=TRUE, invert=TRUE)
d <- h[soil.hz, v]
hz.thick <- hz.thick[soil.hz]
hz.with.data <- which(complete.cases(d))
# compute fraction of data based on thickness
res <- sum(hz.thick[hz.with.data], na.rm=TRUE) / sum(hz.thick, na.rm = TRUE)

return(res)
}


## iterate over profiles
# x: SPC with >=1 profiles
# vars: variables to consider
# name: horizon designations
# p: inverted pattern matching non-soil horizons
evalMissingData <- function(x, vars, name='hzname', p='Cr|R|Cd') {
# apply missing data fraction eval to each profile
md <- profileApply(x, .getMissingDataFraction, v=vars, n=name, p=p)
return(md)
}
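
A quick, hypothetical illustration of the thickness-weighted logic above (not part of this commit; the profile and values are made up): a profile where 25 cm out of 50 cm of soil horizons have complete data should evaluate to 0.5.

library(aqp)
# single profile: A (10 cm) and Bt1 (15 cm) have clay data, Bt2 (25 cm) does not;
# the Cr horizon is excluded by the default pattern 'Cr|R|Cd'
d <- data.frame(
  id = 'P1',
  top = c(0, 10, 25, 50),
  bottom = c(10, 25, 50, 70),
  name = c('A', 'Bt1', 'Bt2', 'Cr'),
  clay = c(12, 22, NA, NA),
  stringsAsFactors = FALSE
)
depths(d) <- id ~ top + bottom
# 25 cm of complete data / 50 cm of soil -> expected result: 0.5
evalMissingData(d, vars = 'clay', name = 'name')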

56 changes: 27 additions & 29 deletions R/profile_compare.R
@@ -24,13 +24,15 @@

## TODO: determine practical significance of 'filter' argument


## low-level function that the user will probably not ever use directly
# Seems to scale to 1000 profiles with 5 variables, could use optimization
# function requires at least two attributes
# hard coded reference to s$id
# set k to 0 for no depth weighting
-pc <- function(s, vars, max_d, k, filter=NULL, sample_interval=NA, replace_na=TRUE, add_soil_flag=TRUE, return_depth_distances=FALSE, strict_hz_eval=FALSE, progress='none', plot.depth.matrix=FALSE,
-rescale.result=FALSE, verbose=FALSE) {
+pc <- function(s, vars, max_d, k, filter=NULL, sample_interval=NA, replace_na=TRUE,
+add_soil_flag=TRUE, return_depth_distances=FALSE, strict_hz_eval=FALSE, progress='none',
+plot.depth.matrix=FALSE, rescale.result=FALSE, verbose=FALSE) {

# currently this will only work with integer depths
# test by attempting to cast to integers
@@ -51,33 +53,6 @@ rescale.result=FALSE, verbose=FALSE) {
s <- s[filter, ]
}

-## TODO: put this into its own function
-## TODO: weight result using horizon thickness
-# iterate over profiles and compute percent missing data by variable
-pct_missing <- ddply(s, 'id', .fun=function(i, v=vars) {
-## TODO: rows.to.review may not be needed
-# only evaluate missing data within horizons that aren't completly NA (Oi, Cr, R horizons)
-# all.missing.test <- apply(sapply(i[, v], is.na), 1, all)
-# rows.to.review <- which( ! all.missing.test )
-round(sapply(i[, v], function(j) length(which(is.na(j)))) / nrow(i) * 100)
-})
-
-# keep track of profiles missing any or all of their data
-problem.profiles.idx <- which(apply(pct_missing[, vars], 1, function(i) any(i > 0)) )
-bad.profiles.idx <- which(apply(pct_missing[, vars], 1, function(i) all(i == 100)) )
-
-if(length(problem.profiles.idx) > 0) {
-assign('problem.profiles', value=pct_missing[problem.profiles.idx,], envir=aqp.env)
-message('Missing data will bias results, check inputs.\nPercent missing:')
-print(pct_missing[problem.profiles.idx, ])
-}
-
-if(length(bad.profiles.idx) > 0) {
-# stop stop and let the user know
-bad.profiles <- unique(s$id)[bad.profiles.idx]
-stop(paste('no non-NA values associated with profiles:', paste(bad.profiles, collapse=', '), '\nConsider removing these profiles and re-running.'), call.=FALSE)
-}
-
# identify the number of profiles
# n.profiles <- length(s)
n.profiles <- length(unique(s$id))
@@ -316,6 +291,29 @@
pc.SPC <- function(s, vars, rescale.result=FALSE, ...){
# default behavior: do not normalize D

+## 2016-02-22: check for missing data moved from pc() to here
+## TODO: this makes an assumption on the column containing horizon designations
+# iterate over profiles and compute percent missing data by variable
+pct_data <- evalMissingData(s, vars)
+
+## TODO: review this, and make optional via argument
+# keep track of profiles missing any or all of their data
+problem.profiles.idx <- which(pct_data < 1)
+# bad.profiles.idx <- which(pct_data == 0)
+
+if(length(problem.profiles.idx) > 0) {
+# assign('problem.profiles', value=pct_missing[problem.profiles.idx,], envir=aqp.env)
+message('Missing data will bias results, check inputs.')
+}
+
+
+## 2016-02-22: disabled for now
+# if(length(bad.profiles.idx) > 0) {
+# # stop stop and let the user know
+# bad.profiles <- profile_id(s)[bad.profiles.idx]
+# stop(paste('no non-NA values associated with profiles:', paste(bad.profiles, collapse=', '), '\nConsider removing these profiles and re-running.'), call.=FALSE)
+# }
+
# extract horizons
s.hz <- horizons(s)

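
Since the relocated check in pc.SPC() now only warns about incomplete profiles instead of removing them, a caller can pre-screen profiles with evalMissingData() before computing distances. A rough sketch, assuming the sp4 sample data shipped with aqp and an arbitrary 90% completeness threshold:

library(aqp)
data(sp4)
depths(sp4) <- id ~ top + bottom
# per-profile fraction (by thickness) of horizons with complete Ca/Mg/K data
complete.frac <- evalMissingData(sp4, vars = c('Ca', 'Mg', 'K'), name = 'name')
# keep profiles that are at least 90% complete, then compute pair-wise distances
keep <- which(complete.frac >= 0.9)
d <- profile_compare(sp4[keep, ], vars = c('Ca', 'Mg', 'K'), max_d = 40, k = 0)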
41 changes: 41 additions & 0 deletions man/evalMissingData.Rd
@@ -0,0 +1,41 @@
\name{evalMissingData}
\alias{evalMissingData}

\title{Evaluate Missing Data}
\description{Evaluate missing data in a SoilProfileCollection object}
\usage{
evalMissingData(x, vars, name = "hzname", p = "Cr|R|Cd")
}

\arguments{
\item{x}{a \code{SoilProfileCollection} object}
\item{vars}{a character vector naming horizon-level attributes in \code{x}}
\item{name}{the name of a horizon-level attribute where horizon designations are stored}
\item{p}{REGEX pattern used to match non-soil horizons}
}

\details{Data completeness is evaluated by profile: the cumulative thickness of horizons with complete horizon-level attribute values (specified in \code{vars}) is divided by the total thickness of soil horizons. The default REGEX pattern, \code{p}, should catch most non-soil horizons, which are excluded from the evaluation.}

\value{A vector of values ranging from 0 to 1, representing the fraction of non-NA data (as specified in \code{vars}) for each profile.}

\author{D.E. Beaudette}


\examples{
# example data
data(sp2)
# init SPC object
depths(sp2) <- id ~ top + bottom
# compute data completeness
sp2$data.complete <- evalMissingData(sp2, vars = c('r', 'g', 'b'), name = 'name')
# rank
new.order <- order(sp2$data.complete)
# plot along data completeness ranking
plot(sp2, plot.order=new.order)
# add axis, note re-ordering of axis labels
axis(side=1, at=1:length(sp2), labels = round(sp2$data.complete[new.order], 2),
line=-2, cex.axis=0.75)
}

\keyword{manip}
