-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocrpdf.sh
executable file
·91 lines (73 loc) · 2.45 KB
/
ocrpdf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#! /bin/bash
# ocrpdf.sh - rasterize a PDF with Ghostscript, OCR every page with tesseract
# and merge the OCRed pages into a single output PDF.
# Usage: ocrpdf.sh INPUT.pdf OUTPUT.pdf
# Abort as soon as a command exits with a non-zero status.
set -e
# Value handed to Ghostscript's -r (resolution) option, both when the pages
# are rasterized and when the OCRed pages are merged.
density=300
# Print the command-line synopsis to stderr and terminate with status 1.
usage() {
  printf 'Usage: %s INPUT.pdf OUTPUT.pdf\n' "$0" >&2
  exit 1
}
# Exactly two arguments are required, and the input name must not look like
# a command-line option (leading "-"); anything else prints the usage text.
if (( $# != 2 )) || [[ "$1" == -* ]]; then
  usage
fi
input="$1"
# The input file has to be readable.
[[ -r "${input}" ]] || {
  echo "Cannot read input file ${input}." >&2
  exit 1
}
# Refuse to run when something already exists at the output path.
output="$2"
[[ ! -e "${output}" ]] || {
  echo "Output file ${output} already exists." >&2
  exit 1
}
# Redact the PDF by clicking the pen icon and selecting the "ab" text icon and
# pasting the unicode full block character 0x2588.
# To flatten the redacted PDF, print it to a PostScript file. An unfortunate
# side effect is that this rasterizes the graphics.
# Print the PDF to a Postscript printer such as Microsoft PS Class Driver
# attached to a FILE: port with a Print Processor set to "winprint/RAW".
# Let's say the redacted multi-page image is stored in redacted.ps.
# Split the resulting Postscript file redacted.ps into images suitable for
# tesseract-ocr (PNG) using imagemagick.
# Then OCR the PNGs to PDF pages and merge the pages using Ghostscript.
# Rasterize every page of ${input} to output-page-NNN.png with Ghostscript.
# Ghostscript's output is echoed as it arrives and its "Page N" progress lines
# are parsed, so numpages ends up holding the last page number reported.
echo "Converting ${input} to raster pages ..."
# Delete any existing output-page*.png in the current directory first.
rm -f output-page*.png
numpages=0
# set -e cannot see the exit status of a command run inside <( ... ), so the
# subshell prints this marker line only when gs exits successfully; a missing
# marker therefore means Ghostscript failed.
gs_done_marker="__ocrpdf_gs_finished_ok__"
gs_ok=0
while read -r line; do
  if [[ "${line}" == "${gs_done_marker}" ]]; then
    gs_ok=1
    continue
  fi
  printf '%s\n' "${line}"
  if [[ "${line}" =~ ^Page\ ([0-9]+)$ ]]; then
    numpages="${BASH_REMATCH[1]}"
  fi
done < <(gs -dNOPAUSE -dBATCH -sDEVICE=png16m \
  -sOutputFile=output-page-%03d.png \
  -r"${density}" -dTextAlphaBits=4 -dGraphicsAlphaBits=4 "${input}" 2>&1 \
  && echo "${gs_done_marker}")
if ! (( gs_ok )); then
  echo "Ghostscript failed while converting ${input}." >&2
  exit 1
fi
if ! (( numpages )); then
  echo "No pages found." >&2
  exit 1
fi
# echo "Identifying the number of pages ..."
# maxpage=0
# while read -r line; do
# # echo "${line}"
# if [[ "${line}" =~ \[([0-9]+)\] ]] ; then
# # echo "${BASH_REMATCH[1]}"
# maxpage="${BASH_REMATCH[1]}"
# fi
# done < <(identify output.ps)
# echo "Splitting into rasterized pages ..."
# convert -density ${density} -units PixelsPerInch "output.ps[0-$((numpages-1))]" "output-page.png"
# OCR each rasterized page with tesseract, producing one PDF per page
# (output-page-NNN.png -> output-page-NNN.pdf).
rm -f output-page*.pdf
page=1
while (( page <= numpages )); do
  # Zero-pad the page number to match the names Ghostscript wrote.
  printf -v padded '%03d' "${page}"
  echo "OCR of page ${padded}"
  tesseract "output-page-${padded}.png" "output-page-${padded}" -l eng pdf
  page=$((page + 1))
done
# Merge the per-page OCR PDFs into output-flat.pdf with Ghostscript, remove
# the intermediate page files, then copy the result to ${output}.
echo "Merging all ${numpages} OCRed PDF pages to output-flat.pdf ..."
# Collect the page file names in an array so each one reaches gs as its own
# argument without relying on unquoted word splitting.
page_pdfs=()
for ((i = 1; i <= numpages; i++)); do
  printf -v page_pdf 'output-page-%03d.pdf' "${i}"
  page_pdfs+=("${page_pdf}")
done
rm -f output-flat.pdf
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite \
  -r"${density}" \
  -sOutputFile=output-flat.pdf "${page_pdfs[@]}"
# Remove every intermediate file matching output-page* (page PNGs and PDFs).
rm -f output-page*
printf '%s' "Creating ${output} ..."
# "--" stops option parsing, so an output name starting with "-" is treated
# as a file name rather than as a cp option.
cp -a -- output-flat.pdf "${output}"
rm -f output-flat.pdf
echo " done."