-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3 split.R
60 lines (43 loc) · 2.26 KB
/
3 split.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
################################################################################
# 3 Split
#
# Search for nations in corpus & create training / validation data
################################################################################
# Dependencies ------------------------------------------------------------
# Bootstrap pacman on first use, then let it install (if missing) and attach
# the packages this script relies on.
# requireNamespace() is the documented way to test whether a package is
# usable; installed.packages() scans every library and should not be used
# for that purpose.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(tidyverse, here, lubridate, writexl)
# Load Data ---------------------------------------------------------------
# archive: press releases; the columns datum, kategorie and text are used below
archive <- here("data", "press-archive.csv") %>%
  read_csv()
# lookup: keyword table; the columns keywords and id are used below
lookup <- here("data", "lookup.csv") %>%
  read_csv()
# Find relevant cases -----------------------------------------------------
relevant <- archive %>%
  mutate(
    datum = dmy(datum),
    # helper column, only needed for the year filter below
    jahr = year(datum)
  ) %>%
  filter(
    # keep 2017-2023 only: there are just ~ 450 entries before 2017, so
    # coverage for those years is probably dubious
    jahr > 2016,
    jahr < 2024,
    # restrict to press reports filed under the category "crime"
    kategorie == "Kriminalität"
  ) %>%
  select(-jahr) %>%
  mutate(
    # For every press release, collect the ids of all lookup rows whose
    # keywords are detected in the text and collapse them into a single
    # comma-separated string (keeps the data structure flat).
    # Releases without any hit get NA.
    nations_lookup = map_chr(text, function(release_text) {
      matched_ids <- lookup %>%
        filter(str_detect(release_text, keywords)) %>%
        pull(id)
      if (length(matched_ids) == 0) {
        NA_character_
      } else {
        paste(matched_ids, collapse = ",")
      }
    })
  )
# Create training & validation samples ------------------------------------
# Create reproducible training & validation samples, drawing from all cases
# where keywords were found.
n_training <- 50L
n_validation <- 50L
n_sample <- n_training + n_validation

# filter once and reuse the result for both counting and slicing
matched_cases <- filter(relevant, !is.na(nations_lookup))

# fail early with a clear message instead of sample()'s generic error
if (nrow(matched_cases) < n_sample) {
  stop(
    "Need at least ", n_sample, " cases with keyword matches to draw the ",
    "samples, but found only ", nrow(matched_cases), ".",
    call. = FALSE
  )
}

# fixed seed so the same cases are drawn on every run
set.seed(123)
indices <- sample(seq_len(nrow(matched_cases)), n_sample)
sample_cases <- slice(matched_cases, indices)

# first n_training drawn cases -> training, the remaining ones -> validation;
# the two sets are disjoint because sample() draws without replacement
training <- slice(sample_cases, seq_len(n_training))
validation <- slice(sample_cases, n_training + seq_len(n_validation))
# Export ------------------------------------------------------------------
write_csv(relevant, here("data", "relevant.csv"))
write_xlsx(training, here("data", "training.xlsx"))
write_xlsx(validation, here("data", "validation.xlsx"))