Merge pull request allenai#44 from chrisc36/master

Removes allenai common dependency and publishing info
seele1917 · Oct 18, 2021 · 5752195 · 5752195
2 parents 217b0da + 85a6bfd
commit 5752195
Show file tree

Hide file tree

Showing 18 changed files with 211 additions and 88 deletions.
diff --git a/README.md b/README.md
@@ -29,14 +29,7 @@ subfigures, the returned figure will include all the subfigures. If a table or f
 titles or comments, those elements will be included in the figure.
 
 ### Installation
-PDFFigures2 is published to bintray at https://bintray.com/allenai/maven. 
-To install it you will need an appropriate resolver like:
-
-`resolvers += Resolver.bintrayRepo("allenai", "maven")`
-
-Then include
-
-`libraryDependencies += "org.allenai" %%  "pdffigures2" % "0.0.11"`
+Clone the repo and then run with sbt.
 
 For licensing reasons, PDFFigures2 does not include libraries for some image formats. Without these
 libraries, PDFFigures2 cannot process PDFs that contain images in these formats. If you have no
@@ -60,12 +53,12 @@ To run on a PDF and get a preview of the results use:
 To get a visualization of how the PDF was parsed:
 
 `sbt "run-main
-org.allenai.pdffigures2.FigureExtractorVisualizationCli /path/to/pdf" -r`
+org.allenai.pdffigures2.FigureExtractorVisualizationCli /path/to/pdf -r"`
 
 To get a visualization of all the intermediate steps:
 
 `sbt "run-main
-org.allenai.pdffigures2.FigureExtractorVisualizationCli /path/to/pdf" -s`
+org.allenai.pdffigures2.FigureExtractorVisualizationCli /path/to/pdf -s"`
 
 To run on lots of PDFs while saving the images, figure objects, and run statistics:
 
@@ -179,22 +172,6 @@ is not part of a figure. This helps avoid false positives induced by including e
 from the header in figures on the first page, but there is probably a way to
 relax this assumption to resolve this issue.
 
-## Releasing new versions
-
-This project releases to BinTray.  To make a release:
-
-1. Pull the latest code on the master branch that you want to release
-1. Edit `build.sbt` to remove "-SNAPSHOT" from the current version
-1. Create a pull request if desired or push to master if you are only changing the version
-1. Tag the release `git tag -a vX.Y.Z -m "Release X.Y.Z"` replacing X.Y.Z with the correct version
-1. Push the tag back to origin `git push origin vX.Y.Z`
-1. Release the build on Bintray `sbt +publish` (the "+" is required to cross-compile)
-1. Verify publication [on bintray.com](https://bintray.com/allenai/maven)
-1. Bump the version in `build.sbt` on master (and push!) with X.Y.Z+1-SNAPSHOT (e.g., 2.5.1
--SNAPSHOT after releasing 2.5.0)
-
-If you make a mistake you can rollback the release with `sbt bintrayUnpublish` and retag the
- version to a different commit as necessary.
 
 ## Contact
-Christopher Clark, [email protected]
+Christopher Clark, [email protected]
diff --git a/build.sbt b/build.sbt
@@ -11,10 +11,6 @@ ThisBuild / version      := "0.1.0"
 lazy val projectSettings = Seq(
   name := "pdffigures2",
   crossScalaVersions := supportedScalaVersions,
-  resolvers ++= Seq(
-    Resolver.bintrayRepo("allenai", "maven"),
-    Resolver.jcenterRepo
-  ),
   publishMavenStyle := true,
   publishArtifact in Test := false,
   pomIncludeRepository := { _ => false },
@@ -27,8 +23,6 @@ lazy val projectSettings = Seq(
   bintrayOrganization := Some("allenai"),
   bintrayRepository := "maven",
   libraryDependencies ++= Seq(
-    "org.allenai.common" %% "common-core" % "2.0.0",
-    "org.allenai.common" %% "common-testkit" % "2.0.0",
     "io.spray" %% "spray-json" % "1.3.5",
     "com.github.scopt" %% "scopt" % "3.7.1",
     "ch.qos.logback" % "logback-classic" % "1.1.7",

diff --git a/src/main/resources/application.conf b/src/main/resources/application.conf
diff --git a/src/main/scala/org/allenai/pdffigures2/CaptionBuilder.scala b/src/main/scala/org/allenai/pdffigures2/CaptionBuilder.scala
@@ -1,7 +1,5 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Logging
-
 import org.apache.pdfbox.pdmodel.font.PDFont
 
 object CaptionBuilder extends Logging {

diff --git a/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala b/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala
@@ -1,8 +1,7 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Logging
-
 import org.apache.pdfbox.pdmodel.font.PDFont
+import FigureType._
 
 case class CaptionStart(
   header: String,

diff --git a/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala b/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala
@@ -1,7 +1,5 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Logging
-
 import org.apache.pdfbox.pdmodel.font.PDFont
 
 import scala.collection.{ immutable, mutable }

diff --git a/src/main/scala/org/allenai/pdffigures2/Figure.scala b/src/main/scala/org/allenai/pdffigures2/Figure.scala
@@ -1,14 +1,13 @@
 package org.allenai.pdffigures2
 
 import java.awt.image.BufferedImage
-import org.allenai.common.{ Enum, EnumCompanion }
 
-sealed abstract class FigureType(id: String) extends Enum[FigureType]
-object FigureType extends EnumCompanion[FigureType] {
-  case object Table extends FigureType("Table")
-  case object Figure extends FigureType("Figure")
-  register(Figure, Table)
+
+object FigureType extends Enumeration {
+  type FigureType = Value
+  val Table, Figure = Value
 }
+import FigureType._
 
 case class CaptionParagraph(name: String, figType: FigureType, page: Int, paragraph: Paragraph) {
   def boundary: Box = paragraph.boundary

diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala
@@ -1,7 +1,5 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Config._
-import org.allenai.common.Logging
 import org.allenai.pdffigures2.FigureExtractor.{
   Document,
   DocumentContent,
@@ -265,12 +263,29 @@ object FigureExtractor {
   class OcredPdfException(message: String = null, cause: Throwable = null)
       extends RuntimeException(message, cause)
 
-  val conf = ConfigFactory.load()
-  val allowOcr = conf[Boolean]("allowOcr")
-  val detectSectionTitlesFirst = conf[Boolean]("detectSectionTitlesFirst")
-  val rebuildParagraphs = conf[Boolean]("rebuildParagraphs")
-  val ignoreWhiteGraphics = conf[Boolean]("ignoreWhiteGraphics")
-  val cleanRasterizedFigureRegions = conf[Boolean]("cleanRasterizedFigureRegions")
+  // Whether to parse papers that appear to be OCRed. This can be slow, and be warned: we tend to
+  // get worse results on these PDFs.
+  val allowOcr = false
+
+  // Detect section titles before detecting the figures. It is recommended to keep this
+  // off, since extracting figures can remove misleading pieces of text (like figure titles)
+  // that the section title algorithm might fail on.
+  val detectSectionTitlesFirst = false
+
+  // Attempt to rebuild the paragraphs returned from PDFBox, which can improve the
+  // paragraph grouping in some cases
+  val rebuildParagraphs = true
+
+  // Skip colorless or 'empty' graphics when extracting graphic regions; this improves
+  // accuracy. However, if the extracted figures are not being rendered to disk (so only
+  // the metadata is being extracted), turning this off can increase processing speed
+  // non-trivially since the processor can skip reading color-related data from the PDF.
+  val ignoreWhiteGraphics = true
+
+  // Perform some post-processing cleanup on the extracted figures after rendering them;
+  // this can help alleviate minor issues with text in the extracted figure being clipped
+  // at the borders of the figure.
+  val cleanRasterizedFigureRegions = true
 
   def apply(): FigureExtractor = {
     new FigureExtractor(

diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala
@@ -4,7 +4,6 @@ import java.io.File
 import java.util.concurrent.atomic.AtomicInteger
 
 import ch.qos.logback.classic.{ Level, Logger }
-import org.allenai.common.Logging
 import org.allenai.pdffigures2.FigureExtractor.DocumentWithSavedFigures
 import org.allenai.pdffigures2.JsonProtocol._
 import org.apache.pdfbox.pdmodel.PDDocument

diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala
@@ -1,6 +1,5 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Logging
 import org.apache.pdfbox.io.MemoryUsageSetting
 
 import org.apache.pdfbox.pdmodel.PDDocument

diff --git a/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala
@@ -1,7 +1,5 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Logging
-
 object FormattingTextExtractor extends Logging {
 
   // Matches "Abstract. This Paper...", "Abstract", "Abstract-In this paper"

diff --git a/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala b/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala
@@ -1,6 +1,5 @@
 package org.allenai.pdffigures2
 
-import org.allenai.common.Logging
 import org.allenai.pdffigures2.FigureExtractor.OcredPdfException
 
 import org.apache.pdfbox.pdmodel.PDDocument

diff --git a/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala b/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala
@@ -5,8 +5,22 @@ import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText }
 
 import spray.json._
 
+// From https://github.com/spray/spray-json/issues/200
+// to support enum -> json conversion
+class EnumJsonConverter[T <: scala.Enumeration](enu: T) extends RootJsonFormat[T#Value] {
+  override def write(obj: T#Value): JsValue = JsString(obj.toString)
+
+  override def read(json: JsValue): T#Value = {
+    json match {
+      case JsString(txt) => enu.withName(txt)
+      case somethingElse => throw DeserializationException(s"Expected a value from enum $enu instead of $somethingElse")
+    }
+  }
+}
+
 trait JsonProtocol extends DefaultJsonProtocol {
   // JSON formats so we can write Figures/Captions/Documents to disk
+  implicit val enumConverter = new EnumJsonConverter(FigureType)
   implicit val boxFormat = jsonFormat4(Box.apply)
   implicit val captionFormat = jsonFormat5(Caption.apply)
   implicit val figureFormat = jsonFormat7(Figure.apply)