-
Notifications
You must be signed in to change notification settings - Fork 36
/
_spelling.R
executable file
·112 lines (103 loc) · 3.33 KB
/
_spelling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env Rscript
suppressPackageStartupMessages({
library("R6")
library("pandocfilters")
library("dplyr")
})
# Convert a document to pandoc's JSON AST and parse it into nested R lists.
#
# Arguments:
#   file: path of the document handed to the `pandoc` executable.
#   from: pandoc input format, passed via `-f`.
#
# Returns the parsed JSON with every simplification switched off, so the
# result is made of plain nested lists.
#
# Stops with an error when pandoc reports a non-zero exit status.
pandoc_to_json <- function(file, from = "markdown") {
  # system2() pastes the arguments into a single command line, so the path
  # has to be quoted to survive spaces and shell metacharacters.
  args <- c("-f", from, "-t", "json", shQuote(file))
  out <- system2("pandoc", args, stdout = TRUE)
  # With stdout = TRUE a failing command is reported through the "status"
  # attribute of the captured output.
  status <- attr(out, "status")
  if (!is.null(status) && status != 0L) {
    stop(
      sprintf("pandoc failed on '%s' (exit status %s)", file, status),
      call. = FALSE
    )
  }
  jsonlite::fromJSON(
    out,
    simplifyVector = FALSE,
    simplifyDataFrame = FALSE,
    simplifyMatrix = FALSE
  )
}
# https://stackoverflow.com/questions/2436688/append-an-object-to-a-list-in-r-in-amortized-constant-time-o1
# https://stackoverflow.com/questions/29461530/efficient-linked-list-ordered-set-in-r/29482211#29482211
# Append-only container backed by an environment.
#
# Each added value is stored under the string form of its 1-based insertion
# index ("1", "2", ...). `as.list()` returns the stored values ordered by
# that index, i.e. in insertion order.
ExpandingList <- R6Class(
  "ExpandingList",
  public = list(
    # Create the empty backing environment.
    initialize = function() {
      private$data <- rlang::new_environment()
    },
    # Store `val` under the next free index; returns the object invisibly
    # so calls can be chained.
    add = function(val) {
      next_key <- as.character(length(private$data) + 1L)
      private$data[[next_key]] <- val
      invisible(self)
    },
    # Return the stored values as a list in insertion order.
    as.list = function() {
      items <- as.list(private$data, sorted = FALSE)
      # Keys are numeric strings, so order them numerically rather than
      # lexically ("10" must come after "9").
      insertion_order <- order(as.numeric(names(items)))
      items[insertion_order]
    }
  ),
  private = list(
    # Environment holding the values, keyed by insertion index.
    data = NULL
  )
)
# Detect strings that start with an "http:", "https:" or "doi:" scheme,
# ignoring case. Vectorised over `x`.
is_url <- function(x) {
  scheme_pattern <- stringr::regex("^(https?|doi):", ignore_case = TRUE)
  stringr::str_detect(x, scheme_pattern)
}
# Collect the plain-text fragments of a pandoc AST.
#
# Walks `x` with `astrapply()`. The content of every "Str" / "MetaString"
# node is kept unless `is_url()` flags it; "Code", "Math", "RawInline" and
# "Cite" nodes are replaced by an empty list. `meta` is accepted but not
# used.
#
# Returns a character vector of the collected fragments in the order they
# were visited.
stringify <- function(x, meta) {
  collected <- ExpandingList$new()
  text_keys <- c("Str", "MetaString")
  skipped_keys <- c("Code", "Math", "RawInline", "Cite")
  visit <- function(key, value, ...) {
    if (key %in% text_keys) {
      if (!is_url(value)) {
        collected$add(value)
      }
    } else if (key %in% skipped_keys) {
      list()
    }
  }
  # Only the side effect on `collected` matters; the rewritten tree is
  # discarded.
  astrapply(x, visit)
  purrr::flatten_chr(collected$as.list())
}
# Read the document at `path` through pandoc (input format `from`) and
# return its extracted text fragments joined by single spaces.
parse_text_md <- function(path, from = "markdown") {
  ast <- pandoc_to_json(path, from = from)
  fragments <- stringify(ast)
  stringr::str_c(fragments, collapse = " ")
}
# Normalise a language tag to the `ll_CC` form (lower-case language,
# upper-case remaining parts, underscore separators).
#
# Arguments:
#   lang: a single language tag such as "en-GB"; NULL, a zero-length vector,
#         NA or "" are treated as missing.
#
# Returns a single string. A missing tag and the ambiguous "en" / "eng"
# become "en_US"; any other two-letter tag `xx` becomes `xx_XX`. Each of
# these fallbacks is announced with a message.
normalize_lang <- function(lang = NULL) {
  if (length(lang) == 0L || is.na(lang) || !nzchar(lang)) {
    # Base message() concatenates its arguments, so no string helper from
    # an unattached package is needed here.
    message(
      "DESCRIPTION does not contain 'Language' field. ",
      "Defaulting to 'en-US'."
    )
    lang <- "en-US"
  }
  if (tolower(lang) %in% c("en", "eng")) {
    message("Found ambiguous language 'en'. Defaulting to 'en-US'")
    lang <- "en-US"
  }
  if (nchar(lang) == 2) {
    # A bare two-letter code gets the country of the same name, e.g.
    # "fr" -> "fr_FR".
    oldlang <- lang
    lang <- paste(tolower(lang), toupper(lang), sep = "_")
    message(sprintf(
      "Found ambiguous language '%s'. Defaulting to '%s'",
      oldlang, lang
    ))
  }
  lang <- gsub("-", "_", lang, fixed = TRUE)
  parts <- strsplit(lang, "_", fixed = TRUE)[[1]]
  parts[1] <- tolower(parts[1])
  parts[-1] <- toupper(parts[-1])
  paste(parts, collapse = "_")
}
# Spell check a single document.
#
# Arguments:
#   path: path of the document; its text is extracted with parse_text_md().
#   dict: dictionary passed on to hunspell::hunspell().
#
# Returns a tibble with one row per distinct flagged word and the columns
# `words`, `count` (number of occurrences) and `path`. The columns are
# present even when nothing was flagged, so results from several files can
# always be combined and grouped by `path`.
spell_check_pandoc_one <- function(path, dict) {
  text <- parse_text_md(path)
  bad_words <- purrr::flatten_chr(hunspell::hunspell(text, dict = dict))
  out <- tibble::tibble(words = bad_words) %>%
    count(words) %>%
    rename(count = n)
  # rep() yields a zero-length column for an empty result instead of
  # leaving the column out.
  out[["path"]] <- rep(path, times = nrow(out))
  out
}
# Spell check a set of documents and tabulate the flagged words.
#
# Arguments:
#   path:   character vector of document paths; every path must exist.
#   ignore: character vector of extra words added to the dictionary.
#   lang:   language tag, normalised with normalize_lang().
#
# Returns a tibble with the columns `path` (file base name), `words` and
# `count`, sorted by path and word. When nothing is flagged the result is
# an empty tibble with the same three columns.
spell_check_pandoc <- function(path, ignore = character(), lang = "en_US") {
  stopifnot(is.character(ignore))
  lang <- normalize_lang(lang)
  dict <- hunspell::dictionary(lang, add_words = ignore)
  path <- normalizePath(path, mustWork = TRUE)
  results <- purrr::map_df(sort(path), spell_check_pandoc_one, dict = dict)
  if (nrow(results) == 0L) {
    # Without rows the combined table may lack the `path` / `words`
    # columns; grouping would then pick up the `path` argument instead of a
    # column and fail. Return the empty result directly.
    return(tibble::tibble(
      path = character(),
      words = character(),
      count = integer()
    ))
  }
  results %>%
    group_by(path, words) %>%
    summarise(count = sum(count)) %>%
    arrange(path, words) %>%
    ungroup() %>%
    mutate(path = basename(path))
}
# Spell check every R Markdown file in the project root plus README.md,
# ignoring the words listed in the project's WORDLIST file, and print up to
# 100 rows of the result.
#
# full.names = TRUE keeps the paths anchored at the project root, so the
# script also works when it is started from another working directory; the
# `$` anchor restricts the match to names that end in ".Rmd".
files <- c(
  dir(here::here(), pattern = "\\.Rmd$", full.names = TRUE),
  here::here("README.md")
)
ignore <- readLines(here::here("WORDLIST"))
print(spell_check_pandoc(files, ignore = ignore), n = 100)