-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpaper-keywords
executable file
·136 lines (110 loc) · 3.99 KB
/
paper-keywords
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/bin/bash
# paper-keywords
#
# Reads a paper in some supported format (e.g. PDF) and attempts to produce
# a good set of keywords for searching for that paper in an online citation
# index or bibliography server, e.g. Google Scholar, ACM Portal, etc.
#
# Usage: paper-keywords PAPER_FILE

# Precondition checks. Each one prints a diagnostic on stderr and terminates
# the script with exit status 1 when it fails.
if [[ -z "$1" ]]; then
  echo "You must specify a paper file to process." 1>&2
  exit 1
fi
if [[ ! -f "$1" ]]; then
  echo "$1 is not a file, or not a regular file." 1>&2
  exit 1
fi
# `command -v` is used instead of `test -x $(which aspell)`: when aspell is
# missing, `which` prints nothing, the unquoted substitution vanishes and
# `test -x` (a one-argument test of a non-empty string) succeeds, so the old
# check could never fail.
if ! command -v aspell >/dev/null 2>&1; then
  echo "You must have an available aspell command." 1>&2
  exit 1
fi
first_sane_words () {
  # Filter: stdin -> stdout.
  # Takes the first ten *lines* of the input, joins them into one line, and
  # deletes every stretch of letters/digits that is wrapped around at least
  # one digit or one of the characters : ) ( & \ . /  (numbers, URLs, section
  # references and the like). The surviving text is split on runs of spaces
  # and at most the first 20 fields are kept, re-joined with single spaces.
  # Relies on GNU sed for `\+` and `\t`.
  head -n 10 \
    | tr '\n' ' ' \
    | sed -e 's/[a-zA-Z0-9]*[0-9:)(&\.\/]\+[a-zA-Z0-9]*//g' \
          -e 's/ \+/\t/g' \
    | cut -f -20 \
    | tr '\t' ' '
}
complete_sentence () {
  # Filter: stdin -> stdout.
  # Splits the text into sentence-like lines, keeps the lines accepted by
  # pass_line_if_candidate_sentence, and wraps the matched sentence of each
  # kept line in double quotes via rewrite_sentence_as_keyword.
  # Every qualifying line is emitted, not only the first one; no
  # spellchecking is done here.
  sentence_lines \
    | pass_line_if_candidate_sentence \
    | rewrite_sentence_as_keyword
}
sentence_lines () {
  # Filter: stdin -> stdout; any arguments are ignored.
  # 1. Every character outside tr's [:print:] class (newlines included) is
  #    replaced by a space, so the whole input becomes one long line.
  # 2. A line break is inserted after each period that follows anything
  #    other than an uppercase ASCII letter. Periods directly after a
  #    capital (initials, "U.S.") therefore do not end a line.
  # The result is roughly one sentence per line. The `\n` in the sed
  # replacement is a GNU sed feature.
  tr -c '[:print:]' ' ' \
    | sed -e 's/[^A-Z]\./&\n/g'
}
# Earlier, simpler attempt, kept for reference:
#candidate_sentence_expression="^[ \t]*((([A-Z][a-z])|(A ))([A-Za-z]|[-' ]){75,150}\.)"
# The following nightmarish extended regex is intended to match
# "straightforward" English sentences. It deliberately
# avoids matching things which look like artifacts of pdftotext,
# for example one-letter words apart from 'a'.
#
# Shape of a match, anchored at the start of the line:
#   - optional leading blanks (spaces or tabs);
#   - capture group 1, the sentence itself:
#       * a first word that is either "A " or a capital followed by one or
#         more lowercase letters;
#       * at least 12 further words, each followed by spaces, where a word
#         is "a"/"A" or two or more characters out of letters, digits,
#         hyphen, apostrophe and parentheses;
#       * a final word of two or more letters/digits/hyphens/apostrophes;
#       * a terminating period.
#
# The expression is fed to both grep -E and sed -r, so it sticks to
# constructs both read the same way:
#   - [[:blank:]] rather than [ \t]: inside a bracket expression grep takes
#     "\t" as the two literal characters backslash and 't', while GNU sed
#     takes it as a tab;
#   - plain ( ) inside the word class: a backslash inside brackets is a
#     literal backslash, not an escape.
# The first word uses [a-z]+ so that three-letter openers such as "The" or
# "Our" are accepted; with a single [a-z] the leftover letter looked like a
# forbidden one-letter word and the whole sentence was rejected.
# Only capture group 1 is referenced by consumers of this expression.
candidate_sentence_expression="^[[:blank:]]*((([A-Z][a-z]+)|(A )) *(([Aa] +)|[-A-Za-z0-9'()]{2,} +){12,}([-A-Za-z0-9']{2,})\.)"
pass_line_if_candidate_sentence () {
  # Filter: stdin -> stdout.
  # Prints only the input lines that match the extended regular expression
  # held in the global $candidate_sentence_expression.
  # `grep -E` replaces the obsolescent `egrep` wrapper, which prints a
  # warning on stderr with current GNU grep. `--` keeps an expression that
  # happens to start with '-' from being parsed as an option.
  grep -E -- "$candidate_sentence_expression"
}
rewrite_sentence_as_keyword () {
  # Filter: stdin -> stdout.
  # On every line where $candidate_sentence_expression matches, the matched
  # text is replaced by its first capture group wrapped in double quotes.
  # Text after the match is left in place; lines without a match pass
  # through untouched.
  # '#' is the s-command delimiter, so the expression must not contain '#'.
  local quote_matched_sentence
  quote_matched_sentence="s#${candidate_sentence_expression}#\"\\1\"#"
  sed -r -e "$quote_matched_sentence"
}
sentence_lines_passing_spellcheck () {
  # Filter: stdin -> stdout.
  # Splits the text into sentence-like lines and keeps only those in which
  # the spellchecker finds nothing to complain about.
  # NOTE(review): the first argument is forwarded to sentence_lines, which
  # reads stdin only and ignores its arguments, so it has no effect.
  sentence_lines "$1" | pass_line_if_spellchecks
}
pass_line_if_spellchecks () {
  # Filter: stdin -> stdout.
  # Feeds each input line to `aspell list` and prints the line, unmodified,
  # only when aspell reports no misspelled words for it.
  # Returns 0 regardless of whether the last line passed.
  local line misspelled
  # IFS= and -r keep leading/trailing blanks and backslashes intact;
  # the `|| [[ -n ... ]]` clause also handles a final line that lacks a
  # terminating newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # printf rather than echo: a line such as "-n" must reach aspell as data.
    misspelled=$( printf '%s\n' "$line" | aspell list )
    #echo "misspelled words: $misspelled" 1>&2
    if [[ -z "$misspelled" ]]; then
      printf '%s\n' "$line"
    fi
  done
}
complete_spellchecked_sentence () {
  # Filter: stdin -> stdout.
  # Splits the text into sentence-like lines, keeps those that match
  # $candidate_sentence_expression and that also pass the spellchecker, and
  # wraps the matched sentence of each surviving line in double quotes.
  # Every qualifying line is emitted, not only the first one.
  #
  # The regex filter runs before the spellchecker so that aspell is only
  # started for lines that are already plausible sentences.
  # (A previous revision assigned an unused global `expression` here that
  # described a 30-120 character rule; that rule was never applied and the
  # assignment has been dropped.)
  sentence_lines | \
    pass_line_if_candidate_sentence | \
    pass_line_if_spellchecks | \
    rewrite_sentence_as_keyword
}
email_addresses () {
  # Filter: stdin -> stdout.
  # Prints every e-mail-like token found in the input, one per line. A
  # token is a run of letters, digits, '.', '_' or '-' on both sides of
  # an '@'.
  #
  # `grep -o` prints just the matched text. The previous grep + `sed s///`
  # combination used an unanchored pattern, so whatever preceded the match
  # on the line was left glued to the address ("Contact: a@b.c" came out as
  # "Contacta@b.c"), and only one address per line could be reported.
  # Returns 0 when there is simply no address in the input; genuine grep
  # failures still yield a non-zero status.
  local address_chars='[-_a-zA-Z.0-9]'
  grep -oE -- "${address_chars}+@${address_chars}+" || [[ $? -eq 1 ]]
}
words_as_lines() {
  # Filter: stdin -> stdout.
  # Turns every character outside tr's [:graph:] class (blanks, newlines,
  # control characters, ...) into a newline and collapses each run of
  # newlines into one, leaving one "word" per line.
  tr -cs '[:graph:]' '\n'
}
alternation_by_line () {
  # Filter: stdin -> stdout.
  # Appends a line reading "OR" after every input line, then discards the
  # final output line, so the input lines end up separated by "OR" lines.
  # Needs GNU sed (`\n` in the replacement) and GNU head (negative count).
  sed -e 's/.*/&\nOR/' \
    | head -n -1
}
# Path of the paper being processed (first command-line argument).
PAPER="$1"
get_paper_text () {
  # Writes the plain text of a paper to stdout.
  # Arguments: $1 - path of the paper; falls back to the global $PAPER when
  #                 omitted or empty.
  # Supported inputs, as reported by `file --mime-type`:
  #   application/pdf        - converted with pdftotext
  #   application/postscript - converted to PDF with ps2pdf first
  # Page 1 is skipped in both cases and runs of newlines are squeezed.
  # On an unsupported type or a failed conversion step a message is printed
  # on stderr and the current shell exits with status 1 (callers are
  # expected to run this inside a command substitution or pipeline).
  #
  # The path argument is now used for the conversion as well; previously
  # only the MIME detection looked at $1 while the conversion silently read
  # the global $PAPER.
  local paper="${1:-$PAPER}"
  local mimetype tmpfile
  mimetype=$( file --mime-type -b -- "$paper" )
  case "$mimetype" in
    (application/pdf) # skip page 1, because it may be boilerplate
      pdftotext -f 2 "$paper" /dev/stdout | tr -s '\n'
      ;;
    (application/postscript)
      if ! command -v ps2pdf >/dev/null 2>&1; then
        echo "Postscript file found but no ps2pdf." 1>&2
        exit 1
      fi
      if ! tmpfile=$(mktemp); then
        echo "Could not create a temporary file." 1>&2
        exit 1
      fi
      if ! ps2pdf "$paper" "$tmpfile"; then
        # Do not leave the half-written temporary PDF behind.
        rm -f -- "$tmpfile"
        echo "ps2pdf failed to convert $paper." 1>&2
        exit 1
      fi
      pdftotext -f 2 "$tmpfile" /dev/stdout | tr -s '\n'
      rm -f -- "$tmpfile"
      ;;
    (*) echo "Sorry, mimetype $mimetype is not yet supported!" 1>&2; exit 1
      ;;
  esac
}
# Main flow.
# Extract the paper's text; every run of non-printable characters (newlines
# included) is collapsed into a single newline.
text=$( get_paper_text "$1" | tr -cs '[:print:]' '\n' )
if [[ -z "$text" ]]; then
  echo "Could not extract any text from ${PAPER}!" 1>&2
  exit 1
fi
# printf '%s' is used instead of `echo -n` so that the extracted text is
# always passed on as data, whatever it starts with.
first_words=$( printf '%s' "$text" | first_sane_words )
email_addresses=$( printf '%s' "$text" | email_addresses )
sentence=$( printf '%s' "$text" | complete_spellchecked_sentence )
# TODO: an unfinished idea was to prefix the query with an OR-alternation of
# the leading words that fail the spellcheck (likely author names), built
# with words_as_lines and alternation_by_line. The aspell pass that fed it
# was computed on every run but never used, so it has been removed.
#echo first: "$first_words"
#echo email: "$email_addresses" 1>&2
#echo sent: "$sentence"
if [[ -n "$sentence" ]]; then
  # Preferred result: the quoted, spellchecked sentence(s).
  printf '%s\n' "$sentence" | rewrite_sentence_as_keyword
elif [[ -n "$first_words" ]]; then
  # Fallback: e-mail addresses followed by the leading words, on one line.
  printf '%s %s\n' "$email_addresses" "$first_words" | tr '\n' ' '; echo
else
  echo "Failed to extract keywords!" 1>&2
  exit 1
fi