Skip to content

Commit

Permalink
Convert some files to pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
eikek committed Feb 19, 2020
1 parent 5869e2e commit 9b13497
Show file tree
Hide file tree
Showing 19 changed files with 604 additions and 97 deletions.
6 changes: 4 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ val files = project.in(file("modules/files")).
settings(
name := "docspell-files",
libraryDependencies ++=
Dependencies.tika ,
Dependencies.tika,
Test / sourceGenerators += Def.task {
val base = (Test/resourceDirectory).value
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
Expand Down Expand Up @@ -204,6 +204,7 @@ val extract = project.in(file("modules/extract")).
name := "docspell-extract",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.twelvemonkeys ++
Dependencies.pdfbox ++
Dependencies.poi ++
Dependencies.commonsIO ++
Expand All @@ -217,7 +218,8 @@ val convert = project.in(file("modules/convert")).
settings(
name := "docspell-convert",
libraryDependencies ++=
Dependencies.flexmark
Dependencies.flexmark ++
Dependencies.twelvemonkeys
).dependsOn(common, files % "compile->compile;test->test")

val analysis = project.in(file("modules/analysis")).
Expand Down
13 changes: 11 additions & 2 deletions modules/common/src/main/scala/docspell/common/File.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
import java.util.concurrent.atomic.AtomicInteger

import scala.jdk.CollectionConverters._
import fs2.Stream
import cats.implicits._
import cats.effect.{Blocker, ContextShift, Resource, Sync}
import cats.effect._

object File {

Expand Down Expand Up @@ -42,6 +43,9 @@ object File {
count.get
}

/** Checks whether `file` exists. The file-system lookup is suspended
  * in `F` and only performed when the effect is run.
  */
def exists[F[_]: Sync](file: Path): F[Boolean] =
  Sync[F].delay {
    Files.exists(file)
  }

/** Checks that `file` exists and that its size is strictly greater
  * than `minSize` (default 0). The size is only queried when the file
  * exists; both lookups are suspended in `F`.
  */
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
  Sync[F].delay {
    val present = Files.exists(file)
    present && Files.size(file) > minSize
  }

Expand All @@ -61,6 +65,11 @@ object File {
javaList.asScala.toList.sortBy(_.getFileName.toString)
}

def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) =
/** Streams the contents of `file` as bytes by delegating to
  * `fs2.io.file.readAll` with the given `blocker` and `chunkSize`.
  */
def readAll[F[_]: Sync: ContextShift](
    file: Path,
    blocker: Blocker,
    chunkSize: Int
): Stream[F, Byte] =
  fs2.io.file.readAll(file, blocker, chunkSize)

/** Reads `file` completely into a single string. The bytes are read
  * via `readAll` with a chunk size of 8192, decoded as UTF-8 and the
  * decoded pieces are concatenated.
  */
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = {
  val bytes = readAll[F](file, blocker, 8192)
  val text  = bytes.through(fs2.text.utf8Decode)
  text.compile.foldMonoid
}
}
100 changes: 94 additions & 6 deletions modules/convert/src/main/scala/docspell/convert/Conversion.scala
Original file line number Diff line number Diff line change
@@ -1,24 +1,112 @@
package docspell.convert

import java.nio.charset.StandardCharsets

import fs2._
import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}

trait Conversion[F[_]] {

def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]]
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]

}

object Conversion {

def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] =
def create[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
blocker: Blocker,
logger: Logger[F]
): Resource[F, Conversion[F]] =
Resource.pure(new Conversion[F] {

def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = {
println(s"$cfg $blocker $logger")
???
}
/** Converts `in` to PDF and hands the outcome to `handler`.
  *
  * The mime type is resolved via `TikaMimetype.resolve` and selects
  * the conversion:
  *  - pdf: passed to the handler unchanged
  *  - html: converted with `WkHtmlPdf`
  *  - text: rendered to html with `Markdown.toHtml`, then `WkHtmlPdf`
  *  - images: converted with `Tesseract`, unless the image size
  *    exceeds `cfg.maxImageSize`
  *  - office documents: converted with `Unoconv`
  *  - anything else: reported as unsupported format
  */
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = {
  // html input (given directly or rendered from text) goes through wkhtmltopdf
  def htmlToPdf(html: Stream[F, Byte]): F[A] =
    WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(html, handler)

  // images are converted by tesseract
  def imageToPdf: F[A] =
    Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)

  TikaMimetype.resolve(dataType, in).flatMap {
    case MimeType.pdf =>
      handler.run(ConversionResult.successPdf(in))

    case MimeType.html =>
      htmlToPdf(in)

    case Texts(_) =>
      Markdown.toHtml(in, cfg.markdown).flatMap { html =>
        val utf8 = html.getBytes(StandardCharsets.UTF_8)
        htmlToPdf(Stream.chunk(Chunk.bytes(utf8)).covary[F])
      }

    case Images(mt) =>
      ImageSize.get(in).flatMap {
        case Some(dim) if dim.product > cfg.maxImageSize =>
          val rejected = ConversionResult.inputMalformed[F](
            mt,
            s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})."
          )
          logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
            handler.run(rejected)

        case Some(_) =>
          imageToPdf

        case None =>
          // size could not be determined; try the conversion regardless
          logger.info(
            s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
          ) *> imageToPdf
      }

    case Office(_) =>
      Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler)

    case other =>
      handler.run(ConversionResult.unsupportedFormat(other))
  }
}
})

/** Extractor that matches the image mime types contained in `all`
  * (jpeg, png and tiff).
  */
object Images {
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  /** Returns the mime type itself if it is a member of `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}

/** Extractor that matches every mime type whose primary part is
  * `text`.
  */
object Texts {
  def unapply(m: MimeType): Option[MimeType] =
    if (m.primary == "text") Some(m) else None
}

/** Extractor that matches the office document mime types contained
  * in `all`.
  */
object Office {
  // OpenDocument text/spreadsheet, including the `x-` aliases
  val odt      = MimeType.application("vnd.oasis.opendocument.text")
  val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
  val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
  val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

  // container types as named by tika
  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")

  // Microsoft formats and rtf
  val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
  val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  val xls  = MimeType.application("vnd.ms-excel")
  val doc  = MimeType.application("msword")
  val rtf  = MimeType.application("rtf")

  // odt/ods files are plain zip archives; when no filename is
  // available, tika reports them as application/zip
  val odfContainer = MimeType.zip

  val all =
    Set(
      odt,
      ods,
      odtAlias,
      odsAlias,
      msoffice,
      ooxml,
      docx,
      xlsx,
      xls,
      doc,
      rtf,
      odfContainer
    )

  /** Returns the mime type itself if it is a member of `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package docspell.convert

import cats.data.Kleisli
import fs2.Stream
import docspell.common.MimeType

/** Outcome of an attempt to convert some input into a PDF. The
  * possible cases are defined in the companion object.
  */
sealed trait ConversionResult[F[_]] {

  /** The bytes of the PDF; the cases that carry no PDF return an empty
    * stream.
    */
  def pdfData: Stream[F, Byte]
}

object ConversionResult {

  /** External tools perform the conversion and write their results to
    * temporary files, which are deleted once the process finishes. A
    * handler is the place to do something relevant with these files
    * while they still exist.
    */
  type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A]

  /** The input has a mime type for which no conversion is available. */
  case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] {
    val pdfData = Stream.empty
  }

  /** The conversion failed with the given error. */
  case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] {
    val pdfData = Stream.empty
  }

  /** The conversion produced a PDF. */
  case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] {
    val pdfData = pdf
  }

  /** The conversion produced a PDF together with a text. */
  case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String])
      extends ConversionResult[F] {
    val pdfData = pdf
  }

  /** The input was rejected for the given reason. */
  case class InputMalformed[F[_]](mimeType: MimeType, reason: String)
      extends ConversionResult[F] {
    val pdfData = Stream.empty
  }

  // Constructors that return the widened `ConversionResult[F]` type.

  def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] =
    UnsupportedFormat[F](mime)

  def failure[F[_]](ex: Throwable): ConversionResult[F] =
    Failure[F](ex)

  def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] =
    SuccessPdf[F](pdf)

  def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] =
    SuccessPdfTxt[F](pdf, txt)

  def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] =
    InputMalformed[F](mimeType, reason)
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
package docspell.convert

import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig

case class ConvertConfig(markdown: MarkdownConfig)
/** Settings for converting files to PDF.
  *
  * @param chunkSize chunk size handed to the external conversion tools
  * @param maxImageSize upper limit for the size of an image (compared
  *   against the product of its dimensions) to be converted
  * @param markdown settings for rendering text to html
  * @param wkhtmlpdf settings for the html to pdf conversion
  * @param tesseract settings for the image to pdf conversion
  * @param unoconv settings for the office document to pdf conversion
  */
case class ConvertConfig(
    chunkSize: Int,
    maxImageSize: Int,
    markdown: MarkdownConfig,
    wkhtmlpdf: WkHtmlPdfConfig,
    tesseract: TesseractConfig,
    unoconv: UnoconvConfig
)
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,34 @@ package docspell.convert.extern

import java.nio.file.Path

import cats.implicits._
import cats.effect._
import fs2.{Pipe, Stream}
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}

object ExternConv {
private[extern] object ExternConv {

def toPDF[F[_]: Sync: ContextShift](
def toPDF[F[_]: Sync: ContextShift, A](
name: String,
cmdCfg: SystemCommand.Config,
wd: Path,
chunkSize: Int,
useStdin: Boolean,
blocker: Blocker,
logger: Logger[F]
): Pipe[F, Byte, Byte] =
in =>
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
logger: Logger[F],
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
val inFile = dir.resolve("infile").toAbsolutePath.normalize
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
val sysCfg =
cmdCfg.replace(
Map("{{outfile}}" -> out.toString) ++
Map(
"{{outfile}}" -> out.toString
) ++
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
else Map.empty)
else Map.empty)
)

val createInput: Pipe[F, Byte, Unit] =
Expand All @@ -35,41 +39,66 @@ object ExternConv {
in.through(createInput).flatMap { _ =>
SystemCommand
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
.flatMap(result =>
logResult(name, result, logger) ++ readResult[F](
out,
result,
blocker,
chunkSize,
logger
)
.evalMap(result =>
logResult(name, result, logger).
flatMap(_ => reader(out, result)).
flatMap(handler.run)
)
}
}
}.compile.lastOrError

def readResult[F[_]: Sync: ContextShift](
out: Path,
result: SystemCommand.Result,
blocker: Blocker,
chunkSize: Int,
logger: Logger[F]
): Stream[F, Byte] =
Stream.eval(File.existsNonEmpty[F](out)).flatMap {
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
File.existsNonEmpty[F](out).flatMap {
case true =>
if (result.rc == 0) File.readAll(out, blocker, chunkSize)
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
else
Stream
.eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists."))
.drain ++
File.readAll(out, blocker, chunkSize)
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]

case false =>
ConversionResult.failure[F](
new Exception(s"Command result=${result.rc}. No output file found.")
).pure[F]
}

/** Creates the conversion result from the files named
  * `<outPrefix>.pdf` and `<outPrefix>.txt` that are expected next to
  * `out`.
  *
  * If the pdf file is missing or empty, a `failure` result is
  * returned. If it exists and the command returned 0, the result
  * contains the pdf and, if the txt file exists, its text. If the pdf
  * exists but the return code is not 0, a warning is logged and only
  * the pdf is returned.
  *
  * @param outPrefix file name (without extension) of the result files
  * @param blocker used for reading the files
  * @param chunkSize chunk size for reading the pdf
  * @param logger receives the warning about a non-zero return code
  * @param out a path in the output directory; only its parent is
  *   relevant, as the result files are resolved as siblings
  * @param result the result of the executed command
  */
def readResultTesseract[F[_]: Sync: ContextShift](
    outPrefix: String,
    blocker: Blocker,
    chunkSize: Int,
    logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
  val outPdf = out.resolveSibling(s"$outPrefix.pdf")
  File.existsNonEmpty[F](outPdf).flatMap {
    case true =>
      val outTxt = out.resolveSibling(s"$outPrefix.txt")
      File.exists(outTxt).flatMap { txtExists =>
        // Read the file whose existence was just verified. Reading
        // `out` would pick the wrong (or a missing) file whenever
        // `outPrefix` is not the base name of `out`.
        val pdfData = File.readAll(outPdf, blocker, chunkSize)
        if (result.rc == 0) {
          if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
          else successPdf(pdfData).pure[F]
        } else {
          logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
            successPdf(pdfData).pure[F]
        }
      }

    case false =>
      ConversionResult
        .failure[F](
          new Exception(s"Command result=${result.rc}. No output file found.")
        )
        .pure[F]
  }
}

private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] =
private def storeDataToFile[F[_]: Sync: ContextShift](
name: String,
blocker: Blocker,
logger: Logger[F],
inFile: Path
): Pipe[F, Byte, Unit] =
in =>
Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
Stream.eval(storeFile(in, inFile, blocker))
Expand All @@ -78,12 +107,12 @@ object ExternConv {
name: String,
result: SystemCommand.Result,
logger: Logger[F]
): Stream[F, Nothing] =
Stream.eval(logger.debug(s"$name stdout: ${result.stdout}")).drain ++
Stream.eval(logger.debug(s"$name stderr: ${result.stderr}")).drain
): F[Unit] =
logger.debug(s"$name stdout: ${result.stdout}") *>
logger.debug(s"$name stderr: ${result.stderr}")

private def storeFile[F[_]: Sync: ContextShift](
in: Stream[F, Byte],
in: Stream[F, Byte],
target: Path,
blocker: Blocker
): F[Unit] =
Expand Down
Loading

0 comments on commit 9b13497

Please sign in to comment.