new function, minimally tested changes to profile_compare
dylanbeaudette committed Feb 23, 2016
1 parent c727a90 commit 4fe1261
Showing 5 changed files with 114 additions and 32 deletions.
5 changes: 2 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,6 @@
Package: aqp
-Version: 1.9.5
-Date: 2015-12-31
-Packaged: 2015-12-31; dylan
+Version: 1.9.6
+Date: 2016-02-23
Title: Algorithms for Quantitative Pedology
Author: Dylan Beaudette <[email protected]>, Pierre Roudier <[email protected]>
Maintainer: Dylan Beaudette <[email protected]>
4 changes: 4 additions & 0 deletions NEWS
@@ -1,3 +1,7 @@
+-------------------------- aqp 1.9.6 (2016-02-23) --------------------------
+* minor changes to profile_compare(), getting ready for a complete overhaul to this function
+* new function evalMissingData(), see manual page

-------------------------- aqp 1.9.5 (2015-12-28) --------------------------
* documentation for new transition probability functions, see ?hzTransitionProbabilities for details

40 changes: 40 additions & 0 deletions R/evalMissingData.R
@@ -0,0 +1,40 @@

## TODO: this needs checking / documentation

## missing data fraction computed on a thickness basis
## this is meant to be run on a single profile at a time
# x: single SPC
# v: variables to consider
# n: horizon designations
# p: inverted pattern matching non-soil horizons
.getMissingDataFraction <- function(x, v, n, p) {
# get horizons
h <- horizons(x)
# get top/bottom
dc <- horizonDepths(x)
# horizon thickness, used to weight the data fraction
hz.thick <- h[[dc[2]]] - h[[dc[1]]]

# get all "soil" horizons, for named variables
soil.hz <- grep(p, h[[n]], ignore.case=TRUE, invert=TRUE)
d <- h[soil.hz, v]
hz.thick <- hz.thick[soil.hz]
hz.with.data <- which(complete.cases(d))
# compute fraction of data based on thickness
res <- sum(hz.thick[hz.with.data], na.rm=TRUE) / sum(hz.thick, na.rm = TRUE)

return(res)
}


## iterate over profiles
# x: SPC with >=1 profiles
# vars: variables to consider
# name: horizon designations
# p: inverted pattern matching non-soil horizons
evalMissingData <- function(x, vars, name='hzname', p='Cr|R|Cd') {
# apply missing data fraction eval to each profile
md <- profileApply(x, .getMissingDataFraction, v=vars, n=name, p=p)
return(md)
}
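
A quick, hypothetical illustration of the thickness-weighted logic above (not part of this commit; the profile and values are made up): a profile where 25 cm out of 50 cm of soil horizons have complete data should evaluate to 0.5.

library(aqp)
# single profile: A (10 cm) and Bt1 (15 cm) have clay data, Bt2 (25 cm) does not;
# the Cr horizon is excluded by the default pattern 'Cr|R|Cd'
d <- data.frame(
  id = 'P1',
  top = c(0, 10, 25, 50),
  bottom = c(10, 25, 50, 70),
  name = c('A', 'Bt1', 'Bt2', 'Cr'),
  clay = c(12, 22, NA, NA),
  stringsAsFactors = FALSE
)
depths(d) <- id ~ top + bottom
# 25 cm of complete data / 50 cm of soil -> expected result: 0.5
evalMissingData(d, vars = 'clay', name = 'name')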

56 changes: 27 additions & 29 deletions R/profile_compare.R
@@ -24,13 +24,15 @@

## TODO: determine practical significance of 'filter' argument


## low-level function that the user will probably not ever use directly
# Seems to scale to 1000 profiles with 5 variables, could use optimization
# function requires at least two attributes
# hard coded reference to s$id
# set k to 0 for no depth weighting
-pc <- function(s, vars, max_d, k, filter=NULL, sample_interval=NA, replace_na=TRUE, add_soil_flag=TRUE, return_depth_distances=FALSE, strict_hz_eval=FALSE, progress='none', plot.depth.matrix=FALSE,
-rescale.result=FALSE, verbose=FALSE) {
+pc <- function(s, vars, max_d, k, filter=NULL, sample_interval=NA, replace_na=TRUE,
+add_soil_flag=TRUE, return_depth_distances=FALSE, strict_hz_eval=FALSE, progress='none',
+plot.depth.matrix=FALSE, rescale.result=FALSE, verbose=FALSE) {

# currently this will only work with integer depths
# test by attempting to cast to integers
@@ -51,33 +53,6 @@ rescale.result=FALSE, verbose=FALSE) {
s <- s[filter, ]
}

-## TODO: put this into its own function
-## TODO: weight result using horizon thickness
-# iterate over profiles and compute percent missing data by variable
-pct_missing <- ddply(s, 'id', .fun=function(i, v=vars) {
-## TODO: rows.to.review may not be needed
-# only evaluate missing data within horizons that aren't completly NA (Oi, Cr, R horizons)
-# all.missing.test <- apply(sapply(i[, v], is.na), 1, all)
-# rows.to.review <- which( ! all.missing.test )
-round(sapply(i[, v], function(j) length(which(is.na(j)))) / nrow(i) * 100)
-})
-
-# keep track of profiles missing any or all of their data
-problem.profiles.idx <- which(apply(pct_missing[, vars], 1, function(i) any(i > 0)) )
-bad.profiles.idx <- which(apply(pct_missing[, vars], 1, function(i) all(i == 100)) )
-
-if(length(problem.profiles.idx) > 0) {
-assign('problem.profiles', value=pct_missing[problem.profiles.idx,], envir=aqp.env)
-message('Missing data will bias results, check inputs.\nPercent missing:')
-print(pct_missing[problem.profiles.idx, ])
-}
-
-if(length(bad.profiles.idx) > 0) {
-# stop stop and let the user know
-bad.profiles <- unique(s$id)[bad.profiles.idx]
-stop(paste('no non-NA values associated with profiles:', paste(bad.profiles, collapse=', '), '\nConsider removing these profiles and re-running.'), call.=FALSE)
-}
-
# identify the number of profiles
# n.profiles <- length(s)
n.profiles <- length(unique(s$id))
@@ -316,6 +291,29 @@
pc.SPC <- function(s, vars, rescale.result=FALSE, ...){
# default behavior: do not normalize D

+## 2016-02-22: check for missing data moved from pc() to here
+## TODO: this makes an assumption on the column containing horizon designations
+# iterate over profiles and compute percent missing data by variable
+pct_data <- evalMissingData(s, vars)
+
+## TODO: review this, and make optional via argument
+# keep track of profiles missing any or all of their data
+problem.profiles.idx <- which(pct_data < 1)
+# bad.profiles.idx <- which(pct_data == 0)
+
+if(length(problem.profiles.idx) > 0) {
+# assign('problem.profiles', value=pct_missing[problem.profiles.idx,], envir=aqp.env)
+message('Missing data will bias results, check inputs.')
+}
+
+
+## 2016-02-22: disabled for now
+# if(length(bad.profiles.idx) > 0) {
+# # stop stop and let the user know
+# bad.profiles <- profile_id(s)[bad.profiles.idx]
+# stop(paste('no non-NA values associated with profiles:', paste(bad.profiles, collapse=', '), '\nConsider removing these profiles and re-running.'), call.=FALSE)
+# }
+
# extract horizons
s.hz <- horizons(s)

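
Since the relocated check in pc.SPC() now only warns about incomplete profiles instead of removing them, a caller can pre-screen profiles with evalMissingData() before computing distances. A rough sketch, assuming the sp4 sample data shipped with aqp and an arbitrary 90% completeness threshold:

library(aqp)
data(sp4)
depths(sp4) <- id ~ top + bottom
# per-profile fraction (by thickness) of horizons with complete Ca/Mg/K data
complete.frac <- evalMissingData(sp4, vars = c('Ca', 'Mg', 'K'), name = 'name')
# keep profiles that are at least 90% complete, then compute pair-wise distances
keep <- which(complete.frac >= 0.9)
d <- profile_compare(sp4[keep, ], vars = c('Ca', 'Mg', 'K'), max_d = 40, k = 0)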
41 changes: 41 additions & 0 deletions man/evalMissingData.Rd
@@ -0,0 +1,41 @@
\name{evalMissingData}
\alias{evalMissingData}

\title{Evaluate Missing Data}
\description{Evaluate missing data in a SoilProfileCollection object}
\usage{
evalMissingData(x, vars, name = "hzname", p = "Cr|R|Cd")
}

\arguments{
\item{x}{a \code{SoilProfileCollection} object}
\item{vars}{a character vector naming horizon-level attributes in \code{x}}
\item{name}{the name of a horizon-level attribute where horizon designations are stored}
\item{p}{REGEX pattern used to match non-soil horizons}
}

\details{Data completeness is evaluated by profile: the cumulative thickness of horizons with complete horizon-level attribute values (specified in \code{vars}) is divided by the total thickness of soil horizons. The default REGEX pattern, \code{p}, should catch most non-soil horizons, which are excluded from the evaluation.}

\value{A vector of values ranging from 0 to 1, representing the fraction of non-NA data (as specified in \code{vars}) for each profile.}

\author{D.E. Beaudette}


\examples{
# example data
data(sp2)
# init SPC object
depths(sp2) <- id ~ top + bottom
# compute data completeness
sp2$data.complete <- evalMissingData(sp2, vars = c('r', 'g', 'b'), name = 'name')
# rank
new.order <- order(sp2$data.complete)
# plot along data completeness ranking
plot(sp2, plot.order=new.order)
# add axis, note re-ordering of axis labels
axis(side=1, at=1:length(sp2), labels = round(sp2$data.complete[new.order], 2),
line=-2, cex.axis=0.75)
}

\keyword{manip}
