Skip to content

Commit 53849d2

Browse files
committed
keep deduplicated list
1 parent f2dc067 commit 53849d2

File tree

1 file changed

+14
-13
lines changed

1 file changed

+14
-13
lines changed

geonames-matching.R

+14 -13
Original file line number | Diff line number | Diff line change
@@ -107,12 +107,13 @@ locs.split$check2 = paste0(locs.split$text,
107107

108108
#deduplicate the substrings per country,
109109
#to remove redundancy from the matching process
110-
locs.split %<>% filter(!duplicated(check))
110+
locs.split2 = locs.split %>%
111+
filter(!duplicated(check))
111112

112113
##
113114
###list all country codes of the locality list
114115
##
115-
countries = count(locs.split,
116+
countries = count(locs.split2,
116117
COUNTRY_CODE)
117118
countries %<>% arrange(desc(n))
118119

@@ -122,15 +123,15 @@ countries %<>% arrange(desc(n))
122123

123124
#slow!
124125
#per country, match a-z substrings of locality to geonames labels
125-
locs.split$geoid = NA
126-
locs.split$lat = NA
127-
locs.split$long = NA
128-
locs.split$geoname = NA
129-
locs.split$geoaltname = NA
130-
locs.split$cn = NA
131-
out = locs.split[1,]
126+
locs.split2$geoid = NA
127+
locs.split2$lat = NA
128+
locs.split2$long = NA
129+
locs.split2$geoname = NA
130+
locs.split2$geoaltname = NA
131+
locs.split2$cn = NA
132+
out = locs.split2[1,]
132133
for (i in 1:dim(countries)[1]) {
133-
apm = filter(locs.split,
134+
apm = filter(locs.split2,
134135
COUNTRY_CODE==countries$COUNTRY_CODE[i])
135136
geo = filter(data,
136137
`country code`==countries$COUNTRY_CODE[i])
@@ -249,9 +250,9 @@ exp3 = left_join(exp3,
249250
lat,
250251
long),
251252
by=c("LOCALITY"="locid"))
252-
write_tsv(exp3,
253-
"enriched specimen data.txt",
254-
na="")
253+
#write_tsv(exp3,
254+
# "enriched specimen data.txt",
255+
# na="")
255256

256257
##
257258
###overlap with BGBM geonames ids

0 commit comments

Comments
 (0)