Skip to content

Commit acd5981

Browse files
committed
implement workflow
1 parent 53849d2 commit acd5981

13 files changed

+595
-21
lines changed

.gitignore

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
.Rproj.user
2+
.Rhistory
3+
.RData
4+
.Ruserdata
5+
6+
data/gbif sets
7+
data/geonames

config.ini

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[source]
2+
data = data/gbif sets/0167495-230224095556074/occurrence.txt
3+
property = locality
4+
columns = src/static/gbif-columns.txt
5+
wikifile = data/geonames/allCountries.txt
6+
data_type = DwC-A
7+
8+
[matching]
9+
cores = 12
10+
rmode = all
11+
12+
[export]
13+
dwc_geo = true
14+
ambiguous = true
15+
fst = true
16+
dissco = false
17+
institution_qid = Q3052500
18+
19+
[rebuild]
20+
filename = data/output/fst/.fst

geonames-matching.Rproj

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX

run.R

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Entry point for the geonames locality-matching workflow.

# Check that all required packages are installed (installing any that are
# missing) and load them.
source("src/pkg.R")
pkgLoad()

# Read the workflow parameters from the ini configuration file.
library(ini)
config = read.ini("config.ini")

# Import the locality strings to match. The full specimen data is kept in
# memory so the matched strings can be connected back to their specimens
# after matching.
source("src/extract_strings.R")
data = extract_strings(path = config$source$data,          # data file location
                       columns_list = config$source$columns, # properties to import
                       property = config$source$property,  # property with the strings
                       data_type = config$source$data_type) # type of data file

# Normalise the strings and split them into matchable chunks for later testing.
parsed_names = parse_strings(data,
                             config$source$property)

# Import the geonames gazetteer data.
source("src/import_geonames.R")
geonames = import_geonames(config$source$wikifile)

source("src/matching.R")
# Determine the set of cores available on this machine for parallel computing.
cores = assess_cores(config$matching$cores)

matching_results = match_wrapper(parsed_names,
                                 geonames,
                                 cores,
                                 config$matching$rmode)

# Filter the matches by a set of rules and convert to a tibble for easier
# exporting of the results.
processed_results = matches_process(matching_results,
                                    parsed_names)

# Export the matched strings into every format enabled in the config.
source("src/export.R")
export(processed_results,
       data = data,
       property = config$source$property,
       foldername = config$source$data,
       export_type = config$export)

src/export.R

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
export <- function(match_results,
                   data,
                   property,
                   foldername,
                   export_type) {
  # Dispatch the matching results to every export format enabled in the
  # [export] section of the config. Flags arrive as the strings
  # "true"/"false" because they are read from an ini file.

  # Darwin Core style tab-separated file with geonames location IDs.
  if (export_type$dwc_geo == "true") {
    export_to_dwc_geo(match_results,
                      data,
                      property,
                      foldername,
                      "geo")
  }

  # Separate file containing only the ambiguous (multi-hit) matches.
  if (export_type$ambiguous == "true") {
    ambiguous_matches = ambiguous_results(match_results,
                                          omit = FALSE)
    export_to_dwc_geo(ambiguous_matches,
                      data,
                      property,
                      foldername,
                      "ambiguous-geo")
  }

  # Binary fst snapshot for fast reloading in later sessions.
  if (export_type$fst == "true") {
    save_fst(match_results,
             foldername)
  }

  # DiSSCo annotation JSON documents.
  if (export_type$dissco == "true") {
    export_dissco_annotation(match_results,
                             data,
                             property,
                             foldername)
  }
}
36+
37+
export_to_dwc_geo <- function(match_results,
                              data,
                              property,
                              foldername,
                              export_type) {
  # Join the matches back to the specimen records and write a Darwin Core
  # style tab-separated file linking specimens to geonames location IDs.
  #
  # match_results: tibble of matches (locid, geonameid, score, n, name, ...)
  # data: specimen records as imported by extract_strings()
  # property: name of the column in `data` holding the locality strings
  # foldername: source data path, used to derive the output file name
  # export_type: output subdirectory/label ("geo" or "ambiguous-geo")
  match_results %<>%
    # many-to-many: a locality string can occur in many specimens and can
    # match several geonames records
    left_join(data,
              by = c("locid" = property),
              relationship = "many-to-many") %>%
    # fixed typo: was "htts://" (missing "p"), now matches the URL used in
    # export_dissco_annotation()
    mutate(locationID = paste0("https://www.geonames.org/",
                               geonameid),
           locationRemarks = paste0("Score: ",
                                    score,
                                    ", # of matches: ",
                                    n,
                                    ", Geonames label: ",
                                    name)) %>%
    select(gbifID,
           occurrenceID,
           locationID,
           !!property,
           countryCode,
           locationRemarks)

  filename = foldername %>%
    generate_filename(export_type,
                      "txt")
  write_tsv(match_results, filename)
}
67+
68+
export_dissco_annotation <- function(match_results,
                                     data,
                                     property,
                                     foldername) {
  # Export the matches as DiSSCo "Annotation" JSON documents, one per match,
  # each linking a specimen (by gbifID) to a geonames locationID. Scores are
  # normalised against the best score in the result set.
  #
  # match_results: tibble of matches (locid, geonameid, score, name, ...)
  # data: specimen records as imported by extract_strings()
  # property: name of the column in `data` holding the locality strings
  # foldername: source data path, used to derive the output file name
  require(uuid)
  require(jsonlite)
  match_results %<>%
    left_join(data,
              by = c("locid" = property),
              relationship = "many-to-many")

  n_matches = nrow(match_results)
  res = vector("list", n_matches)
  # Best score observed; used to scale every score into (0, 1].
  # Renamed from `max` to avoid shadowing base::max inside the loop.
  max_score = max(match_results$score)

  # seq_len() is safe for an empty result set (1:nrow would give c(1, 0)).
  for (i in seq_len(n_matches)) {
    guid = UUIDgenerate()
    res[[i]]$data = list(id = guid,
                         type = "Annotation",
                         attribution = list(id = guid,
                                            version = 1,
                                            type = "Annotation",
                                            motivation = "linking",
                                            target = list(id = match_results$gbifID[i],
                                                          type = "digital_specimen",
                                                          indvProp = "dwc:locationID"),
                                            body = list(type = "dwc:locationID",
                                                        value = paste0("https://www.geonames.org/",
                                                                       match_results$geonameid[i]),
                                                        description = paste0("geonames label: ",
                                                                             match_results$name[i]),
                                                        score = match_results$score[i] / max_score)))
  }

  resp = toJSON(res,
                pretty = TRUE,
                auto_unbox = TRUE)

  filename = foldername %>%
    generate_filename("dissco",
                      "json")

  write(resp, filename)
}
110+
111+
generate_filename <- function(foldername,
                              type,
                              extension) {
  # Build a timestamped output path "data/output/<type>/<dataset>_<ts>.<ext>"
  # and make sure the output directory exists.
  #
  # foldername: source data path; the dataset id is its last path component
  #             (a trailing "/occurrence.txt" is stripped first)
  # type: output subdirectory and label (e.g. "geo", "fst", "dissco")
  # extension: file extension without the dot
  #
  # Returns the generated file path.

  # Filesystem-safe timestamp "YYYY-MM-DD_HH.MM.SS" (one format() call
  # replaces the previous as.character() + gsub() chain; same output).
  timestamp = format(Sys.time(), "%Y-%m-%d_%H.%M.%S")

  dir = paste0("data/output/", type)

  # Strip the occurrence.txt suffix, then keep only the last path component.
  dataset = gsub("/occurrence.txt", "", foldername, fixed = TRUE)
  dataset = gsub(".*/", "", dataset)

  filename = paste0(dir, "/", dataset, "_", timestamp, ".", extension)

  # recursive = TRUE so this also works when data/output itself is missing
  # (plain dir.create() only creates the final level and would fail).
  if (!dir.exists(dir)) {
    dir.create(dir, recursive = TRUE)
  }

  return(filename)
}
140+
141+
save_fst <- function(df,
                     foldername) {
  # Persist the results as a binary fst file for fast reloading later.
  #
  # df: data frame to save
  # foldername: source data path, used to derive the output file name
  require(fst)
  out_path = generate_filename(foldername,
                               "fst",
                               "fst")
  write_fst(df, out_path)
}
149+
150+
ambiguous_results <- function(match_results,
                              omit) {
  # Split the matches on ambiguity. A locality string is ambiguous when it
  # matched more than one geonames record (n > 1).
  #
  # omit = TRUE  -> drop all ambiguous localities, keep only unique matches
  # omit = FALSE -> keep only the ambiguous localities
  ambiguous_ids = match_results %>%
    filter(n > 1) %>%
    pull(locid)

  is_ambiguous = match_results$locid %in% ambiguous_ids
  if (omit) {
    match_results %<>% filter(!is_ambiguous)
  } else {
    match_results %<>% filter(is_ambiguous)
  }
  return(match_results)
}

src/extract_strings.R

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
extract_strings <- function(path,
                            columns_list,
                            property,
                            data_type) {
  # Import specimen records containing the locality strings to be matched.
  #
  # path = (relative) path to where the data file(s) can be found
  # columns_list = path to a file listing colnames to import
  # property = colname which contains the strings to match
  # data_type = format of the data file(s)
  ## "DwC-A" = a Darwin Core Archive (unzipped). occurrence.txt will be used
  ## "dissco" = a JSON document as exported from the DiSSCo sandbox
  require(tidyverse)
  columns = readLines(columns_list,
                      warn = FALSE) %>%
    c(property)

  if (data_type == "DwC-A") {
    # Read everything as character to avoid column type-guessing surprises.
    data = read_tsv(path,
                    quote = "",
                    col_select = all_of(columns),
                    col_types = cols(.default = "c"))
  } else if (data_type == "dissco") {
    require(jsonlite)
    raw = fromJSON(path, simplifyVector = FALSE)
    # Rebuild a tibble from the nested DiSSCo JSON, one row per specimen.
    # (was paste0("dwc:", sym(property)) — the symbol was only being coerced
    # back to a string, so use the string directly)
    data = tibble(!!property := sapply(raw,
                                       function(x)
                                         x$data$attributes$originalData[[paste0("dwc:",
                                                                                property)]]),
                  countryCode = sapply(raw,
                                       function(x)
                                         ifelse(!is.null(x$data$attributes$originalData$`dwc:countryCode`),
                                                x$data$attributes$originalData$`dwc:countryCode`,
                                                NA)),
                  occurrenceID = sapply(raw,
                                        function(x)
                                          x$data$attributes$physicalSpecimenId),
                  gbifID = sapply(raw,
                                  function(x)
                                    x$data$attributes$id))
  } else {
    # Previously an unknown data_type fell through to an undefined `data`
    # ("object 'data' not found"); fail with a clear message instead.
    stop("Unknown data_type: '", data_type,
         "'. Expected \"DwC-A\" or \"dissco\".",
         call. = FALSE)
  }
  return(data)
}
43+
44+
parse_strings <- function(data,
                          property,
                          unknowns_path = "src/static/unknowns.txt") {
  # Normalise the locality strings and split them into matchable chunks.
  #
  # data: specimen records as returned by extract_strings()
  # property: colname which contains the strings to match
  # unknowns_path: file listing strings that mean "unknown locality" and
  #                should be excluded from matching (parameterised; default
  #                preserves the previous hard-coded path)
  require(magrittr)

  unknowns = readLines(unknowns_path,
                       warn = FALSE)

  parsed_names = data %>%
    # one row per distinct (string, country) pair; n = its frequency
    count(!!sym(property),
          countryCode) %>%
    filter(!is.na(!!sym(property)),
           !(!!sym(property) %in% unknowns),
           !is.na(countryCode)) %>%
    # keep the original full string as the locality id
    mutate(locid = !!sym(property)) %>%
    # split multi-part localities on common separator characters
    separate_rows(!!sym(property),
                  sep = ",|;| -|:|\\(|/|\'|\"") %>%
    # lowercase, letters-only version of each chunk for robust matching
    mutate(chunk = gsub("[^a-z]",
                        "",
                        tolower(!!sym(property)))) %>%
    filter(chunk != "") %>%
    # composite keys: chunk + country and chunk + full original string
    mutate(checkid1 = paste0(chunk, countryCode),
           checkid2 = paste0(chunk, locid)) %>%
    rownames_to_column("rownr")

  return(parsed_names)
}

geonames-matching.R src/geonames-matching.R

+7-21
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,22 @@
11
library(tidyverse)
22
library(magrittr)
33

4-
setwd("D:/apm/geonames")
54

65
##
76
###geonames raw data
87
##
9-
data = read_tsv("allCountries.txt",
8+
data = read_tsv("data/geonames/allCountries.txt",
109
col_names = F,
1110
col_types = cols(.default = "c"),
1211
quote = "")
1312

1413
#set colnames
15-
colnames(data) = c("geonameid",
16-
"name",
17-
"asciiname",
18-
"alternatenames",
19-
"latitude",
20-
"longitude",
21-
"feature class",
22-
"feature code",
23-
"country code",
24-
"cc2",
25-
"admin1 code",
26-
"admin2 code",
27-
"admin3 code",
28-
"admin4 code",
29-
"population",
30-
"elevation",
31-
"dem",
32-
"timezone",
33-
"modification date")
14+
colnames = read_delim("data/geonames/colnames.txt",
15+
col_names = F,
16+
delim=" : ") %>%
17+
mutate(cols = trimws(X1))
18+
19+
colnames(data) = colnames$cols
3420

3521
# geonames ids for BGBM specimens
3622
# bgbm = read_csv("Botanical-Data-Export-Mathias.csv",

src/import_geonames.R

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import_geonames <- function(path,
                            colnames_path = "data/geonames/colnames.txt") {
  # Read a geonames gazetteer dump (tab separated, no header row) and apply
  # the column names listed in colnames_path.
  #
  # path: path to the dump, e.g. data/geonames/allCountries.txt
  # colnames_path: file with one "<name> : <description>" line per column
  #                (parameterised; default preserves the previous
  #                hard-coded path)
  #
  # Returns the gazetteer as a tibble with all columns as character.
  geonames = read_tsv(path,
                      col_names = FALSE,
                      col_types = cols(.default = "c"),
                      quote = "")

  # First field before " : " is the column name; trim stray whitespace.
  colnames = read_delim(colnames_path,
                        col_names = FALSE,
                        delim = " : ") %>%
    mutate(cols = trimws(X1))

  colnames(geonames) = colnames$cols

  return(geonames)
}

0 commit comments

Comments
 (0)