Skip to content

Commit

Permalink
Improve accuracy of extractions
Browse files Browse the repository at this point in the history
  • Loading branch information
mgmeyers committed Jul 23, 2022
1 parent a45bc40 commit 56a1377
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 35 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ dist
.DS_Store
pdf-annots2json
pdfannots2json
*.pdf
*.pdf
debug
14 changes: 9 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
"golang.org/x/sync/errgroup"
)

const version = "v1.0.8"
const version = "v1.0.9"

var args struct {
Version kong.VersionFlag `short:"v" help:"Display the current version of pdfannots2json"`
Expand Down Expand Up @@ -118,7 +118,7 @@ func main() {
pageLabel, ok := pageLabelMap[i]

if !ok {
pageLabel = strconv.Itoa(i + 1)
pageLabel = strconv.Itoa(index + 1)
}

g.Go(func() error {
Expand Down Expand Up @@ -164,6 +164,10 @@ func main() {
filtered = append(filtered, a)
}

if len(filtered) == 0 {
return nil
}

var pageImg image.Image
var ocrImg image.Image

Expand Down Expand Up @@ -314,14 +318,14 @@ func processAnnotations(
return nil
}

bounds, o := pdfutils.GetBoundsFromAnnotMarks(anno, markRects)
_, o := pdfutils.GetBoundsFromAnnotMarks(anno, markRects)

if offset == -1 {
offset = o
top = int(math.Max(page.MediaBox.Height()-anno.Y.Hi, 0.0))
}

annotText, err := pdfutils.GetTextByAnnotBounds(fitzDoc, pageIndex, page, bounds)
annotText, err := pdfutils.GetTextByAnnotBounds(fitzDoc, pageIndex, page, anno)
endIfErr(err)

if str == "" {
Expand Down Expand Up @@ -360,7 +364,7 @@ func processAnnotations(
}

builtAnnot := &pdfutils.Annotation{
AnnotatedText: pdfutils.CondenseSpaces(pdfutils.ExpandLigatures(annotatedText)),
AnnotatedText: pdfutils.DeHyphen(pdfutils.CondenseSpaces(pdfutils.ExpandLigatures(annotatedText))),
Color: pdfutils.GetAnnotationColor(annotation),
ColorCategory: pdfutils.GetAnnotationColorCategory(annotation),
Comment: comment,
Expand Down
24 changes: 21 additions & 3 deletions pdfutils/geom.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ func ApplyPageRotation(page *model.PdfPage, rect []float64) []float64 {
return []float64{width - rect[2], height - rect[3], width - rect[0], height - rect[1]}
}

func IsWithinOverlapThresh(annot r2.Rect, mark r2.Rect) bool {
func IsWithinOverlapThresh(annot r2.Rect, mark r2.Rect, thresh float64) bool {
markSize := getArea(mark)
intersect := getArea(annot.Intersection(mark))

return intersect/markSize >= 0.5
return intersect/markSize >= thresh
}

func getArea(r r2.Rect) float64 {
Expand Down Expand Up @@ -161,6 +161,22 @@ func GetClosestMark(x float64, y float64, markRects []r2.Rect) int {
return closest
}

// scaleY returns a copy of rect whose vertical extent is reduced by the
// fraction by of its height, with half of the reduction taken from each of the
// two Y edges. The X extent is carried over unchanged.
func scaleY(rect r2.Rect, by float64) r2.Rect {
	// rect was received by value, so this assignment yields an independent
	// copy to adjust.
	shrunk := rect

	// Amount to pull each Y edge inward.
	inset := (shrunk.Y.Hi - shrunk.Y.Lo) * by / 2

	shrunk.Y.Lo += inset
	shrunk.Y.Hi -= inset

	return shrunk
}

func GetBoundsFromAnnotMarks(annotRect r2.Rect, markRects []r2.Rect) (r2.Rect, int) {
bound := r2.EmptyRect()
boundSet := false
Expand All @@ -171,7 +187,9 @@ func GetBoundsFromAnnotMarks(annotRect r2.Rect, markRects []r2.Rect) (r2.Rect, i
continue
}

if annotRect.Intersects(mark) && IsWithinOverlapThresh(annotRect, mark) {
scaled := scaleY(annotRect, 0.6)

if scaled.Intersects(mark) {
if offset == -1 {
offset = i
}
Expand Down
42 changes: 16 additions & 26 deletions pdfutils/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ func ExpandLigatures(str string) string {
return str
}

// hyphen matches a letter, a hyphen, one or more spaces, and another letter.
// \p{L} is used instead of [a-zA-Z] so that words containing non-ASCII letters
// (for example "Grö- ße") are matched too. Only literal spaces are matched
// after the hyphen, not other whitespace.
var hyphen = regexp.MustCompile(`(\p{L})- +(\p{L})`)

// DeHyphen removes a hyphen and the spaces following it wherever that
// sequence sits directly between two letters, joining the two letters
// together. Hyphens with no trailing space (e.g. "well-known") and hyphens not
// flanked by letters are left as they are. Strings with no match are returned
// unchanged.
func DeHyphen(str string) string {
	return hyphen.ReplaceAllString(str, "$1$2")
}

func RemoveNul(str string) string {
return strings.Map(func(r rune) rune {
if r == unicode.ReplacementChar {
Expand Down Expand Up @@ -134,33 +140,15 @@ func GetTextByAnnotBounds(fitzDoc *fitz.Document, pageIndex int, page *model.Pdf
yAdjust = page.CropBox.Llx - page.MediaBox.Llx
}

rotated := ApplyPageRotation(page, []float64{bounds.X.Lo, bounds.Y.Lo, bounds.X.Hi, bounds.Y.Hi})

x1 := rotated[0]
y1 := rotated[1]
x2 := rotated[2]
y2 := rotated[3]

if *page.Rotate == 0 || *page.Rotate == 180 {
bHeight := rotated[3] - rotated[1]
yDiff := (bHeight * 0.6) / 2
y1 += yDiff
y2 -= yDiff
} else {
bWidth := rotated[2] - rotated[0]
xDiff := (bWidth * 0.6) / 2
x1 += xDiff
x2 -= xDiff
}
scaled := scaleY(bounds, 0.6)
rotated := ApplyPageRotation(page, []float64{scaled.X.Lo, scaled.Y.Lo, scaled.X.Hi, scaled.Y.Hi})

x1 += xAdjust
x2 += xAdjust
y1 += yAdjust
y2 += yAdjust
x1 := rotated[0] + xAdjust
x2 := rotated[2] + xAdjust

// fitz's y-axis is oriented at the top
y1 = height - y1
y2 = height - y2
y1 := height - (rotated[1] + yAdjust)
y2 := height - (rotated[3] + yAdjust)

return fitzDoc.TextByBounds(
pageIndex,
Expand All @@ -180,7 +168,9 @@ func GetFallbackText(text string, annotRect r2.Rect, markRects []r2.Rect, marks
continue
}

if annotRect.Intersects(mark) && IsWithinOverlapThresh(annotRect, mark) {
scaled := scaleY(annotRect, 0.6)

if scaled.Intersects(mark) {
if len(marks[i].Text) > 0 && marks[i].Offset > 0 && len(segment) > 0 {
prevChar := string(text[marks[i].Offset-1])

Expand Down Expand Up @@ -231,7 +221,7 @@ func GetAnnotationID(ids map[string]bool, pageIndex int, x float64, y float64, a
var nlAndSpace = regexp.MustCompile(`[\n\s]+`)

func CondenseSpaces(str string) string {
return nlAndSpace.ReplaceAllString(str, " ")
return nlAndSpace.ReplaceAllString(strings.Trim(str, " "), " ")
}

func intToRoman(number int) string {
Expand Down

0 comments on commit 56a1377

Please sign in to comment.