Skip to content

Commit acd5981

Browse files
committed
implement workflow
1 parent 53849d2 commit acd5981

13 files changed

+595
-21
lines changed

.gitignore

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
.Rproj.user
2+
.Rhistory
3+
.RData
4+
.Ruserdata
5+
6+
data/gbif sets
7+
data/geonames

config.ini

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[source]
2+
data = data/gbif sets/0167495-230224095556074/occurrence.txt
3+
property = locality
4+
columns = src/static/gbif-columns.txt
5+
wikifile = data/geonames/allCountries.txt
6+
data_type = DwC-A
7+
8+
[matching]
9+
cores = 12
10+
rmode = all
11+
12+
[export]
13+
dwc_geo = true
14+
ambiguous = true
15+
fst = true
16+
dissco = false
17+
institution_qid = Q3052500
18+
19+
[rebuild]
20+
filename = data/output/fst/.fst

geonames-matching.Rproj

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX

run.R

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Entry point for the geonames locality-matching workflow.

# Check that all required packages are installed (installing any that are
# missing) and load them.
source("src/pkg.R")
pkgLoad()

# Read the workflow parameters from the ini configuration file.
library(ini)
config = read.ini("config.ini")

# Import the locality strings to match. The full specimen data is kept in
# memory so the matched strings can be connected back to their specimens
# after matching.
source("src/extract_strings.R")
data = extract_strings(path = config$source$data,          # data file location
                       columns_list = config$source$columns, # properties to import
                       property = config$source$property,  # property with the strings
                       data_type = config$source$data_type) # type of data file

# Normalise the strings and split them into matchable chunks for later testing.
parsed_names = parse_strings(data,
                             config$source$property)

# Import the geonames gazetteer data.
source("src/import_geonames.R")
geonames = import_geonames(config$source$wikifile)

source("src/matching.R")
# Determine the set of cores available on this machine for parallel computing.
cores = assess_cores(config$matching$cores)

matching_results = match_wrapper(parsed_names,
                                 geonames,
                                 cores,
                                 config$matching$rmode)

# Filter the matches by a set of rules and convert to a tibble for easier
# exporting of the results.
processed_results = matches_process(matching_results,
                                    parsed_names)

# Export the matched strings into every format enabled in the config.
source("src/export.R")
export(processed_results,
       data = data,
       property = config$source$property,
       foldername = config$source$data,
       export_type = config$export)

src/export.R

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
export <- function(match_results,
                   data,
                   property,
                   foldername,
                   export_type) {
  # Dispatch the matching results to every export format enabled in the
  # [export] section of the config. Flags arrive as the strings
  # "true"/"false" because they are read from an ini file.

  # Darwin Core style tab-separated file with geonames location IDs.
  if (export_type$dwc_geo == "true") {
    export_to_dwc_geo(match_results,
                      data,
                      property,
                      foldername,
                      "geo")
  }

  # Separate file containing only the ambiguous (multi-hit) matches.
  if (export_type$ambiguous == "true") {
    ambiguous_matches = ambiguous_results(match_results,
                                          omit = FALSE)
    export_to_dwc_geo(ambiguous_matches,
                      data,
                      property,
                      foldername,
                      "ambiguous-geo")
  }

  # Binary fst snapshot for fast reloading in later sessions.
  if (export_type$fst == "true") {
    save_fst(match_results,
             foldername)
  }

  # DiSSCo annotation JSON documents.
  if (export_type$dissco == "true") {
    export_dissco_annotation(match_results,
                             data,
                             property,
                             foldername)
  }
}
36+
37+
export_to_dwc_geo <- function(match_results,
                              data,
                              property,
                              foldername,
                              export_type) {
  # Join the matches back to the specimen records and write a Darwin Core
  # style tab-separated file linking specimens to geonames location IDs.
  #
  # match_results: tibble of matches (locid, geonameid, score, n, name, ...)
  # data: specimen records as imported by extract_strings()
  # property: name of the column in `data` holding the locality strings
  # foldername: source data path, used to derive the output file name
  # export_type: output subdirectory/label ("geo" or "ambiguous-geo")
  match_results %<>%
    # many-to-many: a locality string can occur in many specimens and can
    # match several geonames records
    left_join(data,
              by = c("locid" = property),
              relationship = "many-to-many") %>%
    # fixed typo: was "htts://" (missing "p"), now matches the URL used in
    # export_dissco_annotation()
    mutate(locationID = paste0("https://www.geonames.org/",
                               geonameid),
           locationRemarks = paste0("Score: ",
                                    score,
                                    ", # of matches: ",
                                    n,
                                    ", Geonames label: ",
                                    name)) %>%
    select(gbifID,
           occurrenceID,
           locationID,
           !!property,
           countryCode,
           locationRemarks)

  filename = foldername %>%
    generate_filename(export_type,
                      "txt")
  write_tsv(match_results, filename)
}
67+
68+
export_dissco_annotation <- function(match_results,
                                     data,
                                     property,
                                     foldername) {
  # Export the matches as DiSSCo "Annotation" JSON documents, one per match,
  # each linking a specimen (by gbifID) to a geonames locationID. Scores are
  # normalised against the best score in the result set.
  #
  # match_results: tibble of matches (locid, geonameid, score, name, ...)
  # data: specimen records as imported by extract_strings()
  # property: name of the column in `data` holding the locality strings
  # foldername: source data path, used to derive the output file name
  require(uuid)
  require(jsonlite)
  match_results %<>%
    left_join(data,
              by = c("locid" = property),
              relationship = "many-to-many")

  n_matches = nrow(match_results)
  res = vector("list", n_matches)
  # Best score observed; used to scale every score into (0, 1].
  # Renamed from `max` to avoid shadowing base::max inside the loop.
  max_score = max(match_results$score)

  # seq_len() is safe for an empty result set (1:nrow would give c(1, 0)).
  for (i in seq_len(n_matches)) {
    guid = UUIDgenerate()
    res[[i]]$data = list(id = guid,
                         type = "Annotation",
                         attribution = list(id = guid,
                                            version = 1,
                                            type = "Annotation",
                                            motivation = "linking",
                                            target = list(id = match_results$gbifID[i],
                                                          type = "digital_specimen",
                                                          indvProp = "dwc:locationID"),
                                            body = list(type = "dwc:locationID",
                                                        value = paste0("https://www.geonames.org/",
                                                                       match_results$geonameid[i]),
                                                        description = paste0("geonames label: ",
                                                                             match_results$name[i]),
                                                        score = match_results$score[i] / max_score)))
  }

  resp = toJSON(res,
                pretty = TRUE,
                auto_unbox = TRUE)

  filename = foldername %>%
    generate_filename("dissco",
                      "json")

  write(resp, filename)
}
110+
111+
generate_filename <- function(foldername,
                              type,
                              extension) {
  # Build a timestamped output path "data/output/<type>/<dataset>_<ts>.<ext>"
  # and make sure the output directory exists.
  #
  # foldername: source data path; the dataset id is its last path component
  #             (a trailing "/occurrence.txt" is stripped first)
  # type: output subdirectory and label (e.g. "geo", "fst", "dissco")
  # extension: file extension without the dot
  #
  # Returns the generated file path.

  # Filesystem-safe timestamp "YYYY-MM-DD_HH.MM.SS" (one format() call
  # replaces the previous as.character() + gsub() chain; same output).
  timestamp = format(Sys.time(), "%Y-%m-%d_%H.%M.%S")

  dir = paste0("data/output/", type)

  # Strip the occurrence.txt suffix, then keep only the last path component.
  dataset = gsub("/occurrence.txt", "", foldername, fixed = TRUE)
  dataset = gsub(".*/", "", dataset)

  filename = paste0(dir, "/", dataset, "_", timestamp, ".", extension)

  # recursive = TRUE so this also works when data/output itself is missing
  # (plain dir.create() only creates the final level and would fail).
  if (!dir.exists(dir)) {
    dir.create(dir, recursive = TRUE)
  }

  return(filename)
}
140+
141+
save_fst <- function(df,
                     foldername) {
  # Persist the results as a binary fst file for fast reloading later.
  #
  # df: data frame to save
  # foldername: source data path, used to derive the output file name
  require(fst)
  out_path = generate_filename(foldername,
                               "fst",
                               "fst")
  write_fst(df, out_path)
}
149+
150+
ambiguous_results <- function(match_results,
                              omit) {
  # Split the matches on ambiguity. A locality string is ambiguous when it
  # matched more than one geonames record (n > 1).
  #
  # omit = TRUE  -> drop all ambiguous localities, keep only unique matches
  # omit = FALSE -> keep only the ambiguous localities
  ambiguous_ids = match_results %>%
    filter(n > 1) %>%
    pull(locid)

  is_ambiguous = match_results$locid %in% ambiguous_ids
  if (omit) {
    match_results %<>% filter(!is_ambiguous)
  } else {
    match_results %<>% filter(is_ambiguous)
  }
  return(match_results)
}

src/extract_strings.R

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
extract_strings <- function(path,
                            columns_list,
                            property,
                            data_type) {
  # Import specimen records containing the locality strings to be matched.
  #
  # path = (relative) path to where the data file(s) can be found
  # columns_list = path to a file listing colnames to import
  # property = colname which contains the strings to match
  # data_type = format of the data file(s)
  ## "DwC-A" = a Darwin Core Archive (unzipped). occurrence.txt will be used
  ## "dissco" = a JSON document as exported from the DiSSCo sandbox
  require(tidyverse)
  columns = readLines(columns_list,
                      warn = FALSE) %>%
    c(property)

  if (data_type == "DwC-A") {
    # Read everything as character to avoid column type-guessing surprises.
    data = read_tsv(path,
                    quote = "",
                    col_select = all_of(columns),
                    col_types = cols(.default = "c"))
  } else if (data_type == "dissco") {
    require(jsonlite)
    raw = fromJSON(path, simplifyVector = FALSE)
    # Rebuild a tibble from the nested DiSSCo JSON, one row per specimen.
    # (was paste0("dwc:", sym(property)) — the symbol was only being coerced
    # back to a string, so use the string directly)
    data = tibble(!!property := sapply(raw,
                                       function(x)
                                         x$data$attributes$originalData[[paste0("dwc:",
                                                                                property)]]),
                  countryCode = sapply(raw,
                                       function(x)
                                         ifelse(!is.null(x$data$attributes$originalData$`dwc:countryCode`),
                                                x$data$attributes$originalData$`dwc:countryCode`,
                                                NA)),
                  occurrenceID = sapply(raw,
                                        function(x)
                                          x$data$attributes$physicalSpecimenId),
                  gbifID = sapply(raw,
                                  function(x)
                                    x$data$attributes$id))
  } else {
    # Previously an unknown data_type fell through to an undefined `data`
    # ("object 'data' not found"); fail with a clear message instead.
    stop("Unknown data_type: '", data_type,
         "'. Expected \"DwC-A\" or \"dissco\".",
         call. = FALSE)
  }
  return(data)
}
43+
44+
parse_strings <- function(data,
                          property,
                          unknowns_path = "src/static/unknowns.txt") {
  # Normalise the locality strings and split them into matchable chunks.
  #
  # data: specimen records as returned by extract_strings()
  # property: colname which contains the strings to match
  # unknowns_path: file listing strings that mean "unknown locality" and
  #                should be excluded from matching (parameterised; default
  #                preserves the previous hard-coded path)
  require(magrittr)

  unknowns = readLines(unknowns_path,
                       warn = FALSE)

  parsed_names = data %>%
    # one row per distinct (string, country) pair; n = its frequency
    count(!!sym(property),
          countryCode) %>%
    filter(!is.na(!!sym(property)),
           !(!!sym(property) %in% unknowns),
           !is.na(countryCode)) %>%
    # keep the original full string as the locality id
    mutate(locid = !!sym(property)) %>%
    # split multi-part localities on common separator characters
    separate_rows(!!sym(property),
                  sep = ",|;| -|:|\\(|/|\'|\"") %>%
    # lowercase, letters-only version of each chunk for robust matching
    mutate(chunk = gsub("[^a-z]",
                        "",
                        tolower(!!sym(property)))) %>%
    filter(chunk != "") %>%
    # composite keys: chunk + country and chunk + full original string
    mutate(checkid1 = paste0(chunk, countryCode),
           checkid2 = paste0(chunk, locid)) %>%
    rownames_to_column("rownr")

  return(parsed_names)
}

geonames-matching.R src/geonames-matching.R

+7-21
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,22 @@
11
library(tidyverse)
22
library(magrittr)
33

4-
setwd("D:/apm/geonames")
54

65
##
76
###geonames raw data
87
##
9-
data = read_tsv("allCountries.txt",
8+
data = read_tsv("data/geonames/allCountries.txt",
109
col_names = F,
1110
col_types = cols(.default = "c"),
1211
quote = "")
1312

1413
#set colnames
15-
colnames(data) = c("geonameid",
16-
"name",
17-
"asciiname",
18-
"alternatenames",
19-
"latitude",
20-
"longitude",
21-
"feature class",
22-
"feature code",
23-
"country code",
24-
"cc2",
25-
"admin1 code",
26-
"admin2 code",
27-
"admin3 code",
28-
"admin4 code",
29-
"population",
30-
"elevation",
31-
"dem",
32-
"timezone",
33-
"modification date")
14+
colnames = read_delim("data/geonames/colnames.txt",
15+
col_names = F,
16+
delim=" : ") %>%
17+
mutate(cols = trimws(X1))
18+
19+
colnames(data) = colnames$cols
3420

3521
# geonames ids for BGBM specimens
3622
# bgbm = read_csv("Botanical-Data-Export-Mathias.csv",

src/import_geonames.R

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import_geonames <- function(path,
                            colnames_path = "data/geonames/colnames.txt") {
  # Read a geonames gazetteer dump (tab separated, no header row) and apply
  # the column names listed in colnames_path.
  #
  # path: path to the dump, e.g. data/geonames/allCountries.txt
  # colnames_path: file with one "<name> : <description>" line per column
  #                (parameterised; default preserves the previous
  #                hard-coded path)
  #
  # Returns the gazetteer as a tibble with all columns as character.
  geonames = read_tsv(path,
                      col_names = FALSE,
                      col_types = cols(.default = "c"),
                      quote = "")

  # First field before " : " is the column name; trim stray whitespace.
  colnames = read_delim(colnames_path,
                        col_names = FALSE,
                        delim = " : ") %>%
    mutate(cols = trimws(X1))

  colnames(geonames) = colnames$cols

  return(geonames)
}

0 commit comments

Comments
 (0)