-
Notifications
You must be signed in to change notification settings - Fork 0
/
Functions for adjusted counting.R
110 lines (75 loc) · 3.63 KB
/
Functions for adjusted counting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#### Functions
#### Cf http://www.cookbook-r.com/Manipulating_data/Converting_between_data_frames_and_contingency_tables/
# Convert from data frame of counts to data frame of cases.
# `countcol` is the name of the column containing the counts
countsToCases <- function(x, countcol = "Freq") {
# Get the row indices to pull from x
idx <- rep.int(seq_len(nrow(x)), x[[countcol]])
# Drop count column
x[[countcol]] <- NULL
# Get the rows from x
x[idx, ]
}
#### Function for prevalence estimation via Adjusted Counting (AC) ####
### https://github.com/MartinLuquetEcology/Insect-feeding-inference.git ###
#### See Forman et al. (2008), Sebastiani (2015), Gonzalez et al. (2017)
#### Works for any number of classes (but only tested for 2 and 3 classes)
#### "unadj" is a vector containing the unadjusted counts (estimated via Classify and Count)
#### "table" is the confusion matrix returned by a classifier
adjust.prev <- function(unadj,table){
#Let (C1, C2, C3...) be the real classes and (c1, c2, c3...) be the predicted classes
#Let p0(C1) be the "unadjusted" prevalence for C1 (estimated via counting predicted samples) and p(C1) be the real prevalence
#Then
#p0(C1) = p(C1) * p(c1|C1) + p(C2) * p(c1|C2) + p(C3) * p(c1|C3) + ...
#Same for p0(C2), p0(C3), ...
#We can estimate the true and false positive error rates using the training set
#We get "unadjusted" prevalences from the test set
#So to estimate p(C1), p(C2), etc. (i. e. to get the adjusted prevalences)
#We need to resolve the system "unadjusted prevalences" = "adjusted prevalences" * "error rates"
options(error = NULL)
# If one class has a true positive rate of 0, we can't use Adjusted Counting
# The classifier is probably too bad
if (sum(diag(table) == 0)>0) {stop(
"Prevalences cannot be adjusted if for at least one class, no individual was correctly detected in the training set",call.=F)
}
#Error rates matrix
rates <- matrix(0,nrow(table),ncol(table))
for (i in 1:nrow(table))
{
falserates <- table[i,-i]/sum(table[i,])
rates [-i,i] <- falserates
rates [i,i] <- 1-sum(falserates)
}
#Unadjusted prevalences matrix
nb.Ind <- sum(unadj)
if(class(try(solve(rates)))[1] != "matrix"){stop(
"Matrix is not solvable: impossible to adjust prevalences. Dataset may be too small or classifier is not adapted")
}
#Solving
est.vector <- solve(rates,unadj)
### If one class is estimated as <0%, it is corrected as being 0% (Forman 2008)
### Other classes are-readjusted so that they sum to nb.Ind
if (sum(est.vector>=0) != length(est.vector))
{
underZero <- rownames(table)[est.vector<0]
for (i in 1:length(underZero)){
cat(underZero[i],"class was estimated as negative - corrected as zero \n \n")
}
est.vector[est.vector<0] <- 0
est.vector[est.vector>0] <- (est.vector[est.vector>0] / sum(est.vector[est.vector>0])) * nb.Ind
}
# Adjusted prevalence values are stored here
adj.prev <- (round(est.vector))
names(adj.prev) <- rownames(table)
### If there is a 1 difference between estimated number of individuals
## We round the class that is closest to the superior/inferior unit
if(sum(adj.prev) - nb.Ind == 1) {
min.deci <- which.min(est.vector*nb.Ind-floor(est.vector*nb.Ind))
adj.prev[min.deci] <- adj.prev[min.deci]-1
}
if(sum(adj.prev) - nb.Ind == -1) {
max.deci <- which.max(est.vector*nb.Ind-floor(est.vector*nb.Ind))
adj.prev[max.deci] <- adj.prev[max.deci]+1
}
return(adj.prev)
}