forked from eikek/docspell
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
19 changed files
with
604 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100 changes: 94 additions & 6 deletions
100
modules/convert/src/main/scala/docspell/convert/Conversion.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,112 @@ | ||
package docspell.convert | ||
|
||
import java.nio.charset.StandardCharsets | ||
|
||
import fs2._ | ||
import cats.effect._ | ||
import cats.implicits._ | ||
import docspell.common._ | ||
import docspell.convert.ConversionResult.Handler | ||
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf} | ||
import docspell.convert.flexmark.Markdown | ||
import docspell.files.{ImageSize, TikaMimetype} | ||
|
||
/** Abstraction over converting an input byte stream into a PDF.
  *
  * NOTE(review): these rows come from a rendered diff. The first `toPDF`
  * signature below looks like the pre-change row and the second like its
  * replacement; confirm against the real file before relying on both.
  */
trait Conversion[F[_]] { | ||
|
||
// Takes the input stream, its data type and a byte pipe as handler; returns
// the conversion result itself.
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] | ||
// Curried form: data type and result handler first, input stream second.
// The value produced by the handler is what the caller gets back.
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] | ||
|
||
} | ||
|
||
object Conversion { | ||
|
||
// Builds a `Conversion[F]` wrapped in a pure `Resource`. The implementation
// resolves the mime type of the input and dispatches to a converter per type.
//
// NOTE(review): these rows come from a rendered diff. The single-line `create`
// signature and the `toPDF` body ending in `???` look like pre-change rows that
// the multi-line signature and the mime-type dispatch replace; confirm.
def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] = | ||
def create[F[_]: Sync: ContextShift]( | ||
cfg: ConvertConfig, | ||
blocker: Blocker, | ||
logger: Logger[F] | ||
): Resource[F, Conversion[F]] = | ||
Resource.pure(new Conversion[F] { | ||
|
||
// Stub: prints the arguments and is left unimplemented (`???`).
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = { | ||
println(s"$cfg $blocker $logger") | ||
??? | ||
} | ||
// Resolves the mime type from `dataType` and the input, then picks a converter.
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = | ||
TikaMimetype.resolve(dataType, in).flatMap { | ||
// Already a PDF: hand the input to the handler unchanged.
case MimeType.pdf => | ||
handler.run(ConversionResult.successPdf(in)) | ||
|
||
// HTML is converted by WkHtmlPdf.
case MimeType.html => | ||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler) | ||
|
||
// Any other `text/*` type: render to HTML via Markdown, encode the HTML as
// UTF-8 bytes and send those through WkHtmlPdf.
case Texts(_) => | ||
Markdown.toHtml(in, cfg.markdown).flatMap { html => | ||
val bytes = Stream | ||
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8))) | ||
.covary[F] | ||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler) | ||
} | ||
|
||
// Images go to Tesseract unless their size exceeds the configured limit.
case Images(mt) => | ||
ImageSize.get(in).flatMap { | ||
case Some(dim) => | ||
// `dim.product` is compared against `cfg.maxImageSize`; presumably
// width * height in pixels (confirm in ImageSize).
if (dim.product > cfg.maxImageSize) { | ||
logger | ||
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *> | ||
handler.run( | ||
ConversionResult.inputMalformed( | ||
mt, | ||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})." | ||
) | ||
) | ||
} else { | ||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) | ||
} | ||
|
||
// Size could not be determined: log it and attempt the conversion anyway.
case None => | ||
logger.info( | ||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways." | ||
) *> | ||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) | ||
} | ||
|
||
// Office documents are converted by Unoconv.
case Office(_) => | ||
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler) | ||
|
||
// Everything else is reported to the handler as an unsupported format.
case mt => | ||
handler.run(ConversionResult.unsupportedFormat(mt)) | ||
} | ||
}) | ||
|
||
/** Extractor for the image mime types: jpeg, png and tiff. */
object Images {

  /** Mime types this extractor matches. */
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  /** Yields the given mime type when it is a member of `all`, otherwise `None`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
|
||
/** Extractor for every mime type whose primary part is `text`. */
object Texts {

  /** Yields the given mime type when its primary part equals "text", otherwise `None`. */
  def unapply(m: MimeType): Option[MimeType] =
    m.primary match {
      case "text" => Some(m)
      case _      => None
    }
}
|
||
/** Extractor for the office document mime types listed in `all`. */
object Office {

  // OpenDocument text/spreadsheet and their `x-` prefixed aliases
  val odt      = MimeType.application("vnd.oasis.opendocument.text")
  val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
  val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
  val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

  // types named after tika
  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")

  // word processing, spreadsheet and rich text formats
  val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
  val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  val xls  = MimeType.application("vnd.ms-excel")
  val doc  = MimeType.application("msword")
  val rtf  = MimeType.application("rtf")

  // without a filename, tika returns application/zip for odt/ods files, since
  // they are just zip files
  val odfContainer = MimeType.zip

  /** Every mime type this extractor matches. */
  val all =
    Set(
      odt,
      ods,
      odtAlias,
      odsAlias,
      msoffice,
      ooxml,
      docx,
      xlsx,
      xls,
      doc,
      rtf,
      odfContainer
    )

  /** Yields the given mime type when it is a member of `all`, otherwise `None`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all(m)) Some(m) else None
}
} |
53 changes: 53 additions & 0 deletions
53
modules/convert/src/main/scala/docspell/convert/ConversionResult.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package docspell.convert | ||
|
||
import cats.data.Kleisli | ||
import fs2.Stream | ||
import docspell.common.MimeType | ||
|
||
/** Outcome of a conversion attempt; the concrete cases live in the
  * companion object.
  */
sealed trait ConversionResult[F[_]] {

  /** The PDF bytes carried by this result. */
  def pdfData: Stream[F, Byte]
}
|
||
/** Constructors and concrete cases of [[ConversionResult]]. */
object ConversionResult {

  /** The conversion is done by external tools that write files to the
    * file system. These are temporary files and they will be deleted
    * once the process finishes. This handler is used to do something
    * relevant with the resulting files.
    */
  type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A]

  /** Result for a mime type that has no converter. */
  def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] =
    UnsupportedFormat[F](mime)

  /** Result wrapping the error raised by a conversion. */
  def failure[F[_]](ex: Throwable): ConversionResult[F] =
    Failure[F](ex)

  /** Successful result that carries only the PDF bytes. */
  def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] =
    SuccessPdf[F](pdf)

  /** Successful result that carries the PDF bytes and an effect yielding a string
    * (presumably the text belonging to the PDF; confirm at the call sites).
    */
  def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] =
    SuccessPdfTxt[F](pdf, txt)

  /** Result for input that was rejected, together with the reason. */
  def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] =
    InputMalformed[F](mimeType, reason)

  // The cases below annotate `pdfData` explicitly. Without the annotation the
  // member type is inferred from `Stream.empty` and is narrower than the one
  // declared on the trait.

  /** No converter exists for `mime`; carries no PDF data. */
  final case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }

  /** The conversion failed with `ex`; carries no PDF data. */
  final case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }

  /** The conversion produced the PDF bytes `pdf`. */
  final case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = pdf
  }

  /** The conversion produced the PDF bytes `pdf` and the string effect `txt`. */
  final case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String])
      extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = pdf
  }

  /** The input of type `mimeType` was rejected for `reason`; carries no PDF data. */
  final case class InputMalformed[F[_]](mimeType: MimeType, reason: String)
      extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }
}
8 changes: 7 additions & 1 deletion
8
modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,11 @@ | ||
package docspell.convert | ||
|
||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} | ||
import docspell.convert.flexmark.MarkdownConfig | ||
|
||
case class ConvertConfig(markdown: MarkdownConfig) | ||
/** Settings for the convert module.
  *
  * @param chunkSize    passed to every external converter (WkHtmlPdf, Tesseract,
  *                     Unoconv); presumably the chunk size for reading their
  *                     output (confirm in the extern package)
  * @param maxImageSize upper bound compared against an image's `dim.product`;
  *                     larger images are rejected as malformed input
  * @param markdown     settings passed to `Markdown.toHtml`
  * @param wkhtmlpdf    settings passed to `WkHtmlPdf.toPDF`
  * @param tesseract    settings passed to `Tesseract.toPDF`
  * @param unoconv      settings passed to `Unoconv.toPDF`
  */
case class ConvertConfig(chunkSize: Int, | ||
maxImageSize: Int, | ||
markdown: MarkdownConfig, | ||
wkhtmlpdf: WkHtmlPdfConfig, | ||
tesseract: TesseractConfig, | ||
unoconv: UnoconvConfig) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.