From ee8b530c9498a367b09a36457cf2e2ece8559e4b Mon Sep 17 00:00:00 2001 From: Stephane Royer Date: Tue, 25 Jan 2022 01:59:10 +0100 Subject: [PATCH] rehandle do and add pdf & bloomberg --- ...illave.EntityFrameworkCoreExtension.csproj | 10 +- .../Paillave.Etl.Autofac.csproj | 4 +- .../BloombergValuesProvider.cs | 13 +- .../Paillave.Etl.Bloomberg.csproj | 33 ++++ .../Paillave.Etl.Dropbox.csproj | 6 +- .../Paillave.Etl.EntityFrameworkCore.csproj | 8 +- .../Paillave.Etl.ExcelFile.csproj | 2 +- .../Paillave.Etl.ExecutionToolkit.csproj | 6 +- .../Paillave.Etl.FileSystem.csproj | 2 +- ...ave.Etl.FromConfigurationConnectors.csproj | 4 +- src/Paillave.Etl.Ftp/Paillave.Etl.Ftp.csproj | 6 +- .../Paillave.Etl.Mail.csproj | 8 +- src/Paillave.Etl.Pdf/Paillave.Etl.Pdf.csproj | 36 ++++ src/Paillave.Etl.Pdf/PdfFile.Stream.ex.cs | 11 ++ src/Paillave.Etl.Pdf/PdfRowsValuesProvider.cs | 74 ++++++++ .../Paillave.Etl.Sftp.csproj | 4 +- .../Paillave.Etl.SqlServer.csproj | 4 +- .../Paillave.Etl.Tests.csproj | 6 +- .../Paillave.Etl.TextFile.csproj | 2 +- .../Paillave.Etl.XmlFile.csproj | 2 +- src/Paillave.Etl.Zip/Paillave.Etl.Zip.csproj | 6 +- src/Paillave.Etl.sln | 45 ++++- src/Paillave.Etl/Extensions/Do.Stream.ex.cs | 147 +++++++-------- .../Extensions/StreamNodes/DoStreamNode.cs | 141 +++++---------- src/Paillave.Etl/Paillave.Etl.csproj | 2 +- .../ApproximateEqualityComparer.cs | 12 ++ src/Paillave.Pdf/EnumerationEx.cs | 45 +++++ src/Paillave.Pdf/Grid.cs | 162 +++++++++++++++++ src/Paillave.Pdf/GridExtractor.cs | 143 +++++++++++++++ src/Paillave.Pdf/GridLine.cs | 51 ++++++ src/Paillave.Pdf/IBounds.cs | 10 ++ src/Paillave.Pdf/IPdfProcessor.cs | 11 ++ src/Paillave.Pdf/LinesOfWords.cs | 62 +++++++ src/Paillave.Pdf/Paillave.Pdf.csproj | 31 ++++ src/Paillave.Pdf/PdfBlockLine.cs | 8 + src/Paillave.Pdf/PdfReader.cs | 86 +++++++++ src/Paillave.Pdf/PointEqualityComparer.cs | 18 ++ src/Paillave.Pdf/StructureReader.cs | 111 ++++++++++++ src/Paillave.Pdf/SvgBuilder.cs | 76 ++++++++ 
src/Paillave.Pdf/TextTemplate.cs | 169 ++++++++++++++++++ src/Paillave.Pdf/Tools.cs | 17 ++ .../BlogTutorial/BlogTutorial.csproj | 6 +- .../Paillave.Etl.Samples.csproj | 6 +- src/Tutorials/SimpleTutorial/Program.cs | 5 +- .../SimpleTutorial/SimpleTutorial.csproj | 7 +- 45 files changed, 1381 insertions(+), 237 deletions(-) rename src/{Paillave.Etl.TextFile => Paillave.Etl.Bloomberg}/BloombergValuesProvider.cs (93%) create mode 100644 src/Paillave.Etl.Bloomberg/Paillave.Etl.Bloomberg.csproj create mode 100644 src/Paillave.Etl.Pdf/Paillave.Etl.Pdf.csproj create mode 100644 src/Paillave.Etl.Pdf/PdfFile.Stream.ex.cs create mode 100644 src/Paillave.Etl.Pdf/PdfRowsValuesProvider.cs create mode 100644 src/Paillave.Pdf/ApproximateEqualityComparer.cs create mode 100644 src/Paillave.Pdf/EnumerationEx.cs create mode 100644 src/Paillave.Pdf/Grid.cs create mode 100644 src/Paillave.Pdf/GridExtractor.cs create mode 100644 src/Paillave.Pdf/GridLine.cs create mode 100644 src/Paillave.Pdf/IBounds.cs create mode 100644 src/Paillave.Pdf/IPdfProcessor.cs create mode 100644 src/Paillave.Pdf/LinesOfWords.cs create mode 100644 src/Paillave.Pdf/Paillave.Pdf.csproj create mode 100644 src/Paillave.Pdf/PdfBlockLine.cs create mode 100644 src/Paillave.Pdf/PdfReader.cs create mode 100644 src/Paillave.Pdf/PointEqualityComparer.cs create mode 100644 src/Paillave.Pdf/StructureReader.cs create mode 100644 src/Paillave.Pdf/SvgBuilder.cs create mode 100644 src/Paillave.Pdf/TextTemplate.cs create mode 100644 src/Paillave.Pdf/Tools.cs diff --git a/src/Paillave.EntityFrameworkCoreExtension/Paillave.EntityFrameworkCoreExtension.csproj b/src/Paillave.EntityFrameworkCoreExtension/Paillave.EntityFrameworkCoreExtension.csproj index bae0141d..31cfd31f 100644 --- a/src/Paillave.EntityFrameworkCoreExtension/Paillave.EntityFrameworkCoreExtension.csproj +++ b/src/Paillave.EntityFrameworkCoreExtension/Paillave.EntityFrameworkCoreExtension.csproj @@ -18,14 +18,14 @@ - net5.0 + net6.0 latest - - - - + + + + diff 
--git a/src/Paillave.Etl.Autofac/Paillave.Etl.Autofac.csproj b/src/Paillave.Etl.Autofac/Paillave.Etl.Autofac.csproj index 9703e88f..9c9d1ffb 100644 --- a/src/Paillave.Etl.Autofac/Paillave.Etl.Autofac.csproj +++ b/src/Paillave.Etl.Autofac/Paillave.Etl.Autofac.csproj @@ -19,12 +19,12 @@ - net5.0 + net6.0 latest - + diff --git a/src/Paillave.Etl.TextFile/BloombergValuesProvider.cs b/src/Paillave.Etl.Bloomberg/BloombergValuesProvider.cs similarity index 93% rename from src/Paillave.Etl.TextFile/BloombergValuesProvider.cs rename to src/Paillave.Etl.Bloomberg/BloombergValuesProvider.cs index 387c2f2b..02367e1a 100644 --- a/src/Paillave.Etl.TextFile/BloombergValuesProvider.cs +++ b/src/Paillave.Etl.Bloomberg/BloombergValuesProvider.cs @@ -7,8 +7,11 @@ using System.Text; using System.Text.RegularExpressions; using System.Threading; +using Paillave.Etl.TextFile; +using System.Linq.Expressions; +using Paillave.Etl.Core.Mapping; -namespace Paillave.Etl.TextFile +namespace Paillave.Etl.Bloomberg { public class BloombergResult { @@ -23,11 +26,11 @@ public class BloombergValuesProviderArgs } public static class BloombergValuesProvider { - public static BloombergValuesProvider Create(FlatFileDefinition mapping) + public static BloombergValuesProvider Create(Expression> expression) { return new BloombergValuesProvider(new BloombergValuesProviderArgs { - Mapping = mapping, + Mapping = new FlatFileDefinition().WithMap(expression).IsColumnSeparated('|'), }); } } @@ -35,7 +38,7 @@ public class BloombergValuesProvider : ValuesProviderBase _args; public BloombergValuesProvider(BloombergValuesProviderArgs args) => _args = args; - public override ProcessImpact PerformanceImpact => ProcessImpact.Heavy; + public override ProcessImpact PerformanceImpact => ProcessImpact.Average; public override ProcessImpact MemoryFootPrint => ProcessImpact.Light; private enum FileReadState { @@ -71,7 +74,7 @@ public override void PushValues(IFileValue input, Action + + Paillave.EtlNet.Bloomberg + 2.0.5 + 
Stéphane Royer + + true + MIT + https://paillave.github.io/Etl.Net/ + ETL .net core SSIS reactive text file bloomberg + ETL.net bloomberg files extensions + NugetIcon.png + Extensions for Etl.Net to read bloomberg response files + latest + README.md + + + + + + + + net6.0 + + + + + + + + + + diff --git a/src/Paillave.Etl.Dropbox/Paillave.Etl.Dropbox.csproj b/src/Paillave.Etl.Dropbox/Paillave.Etl.Dropbox.csproj index 93ff7792..cef9b356 100644 --- a/src/Paillave.Etl.Dropbox/Paillave.Etl.Dropbox.csproj +++ b/src/Paillave.Etl.Dropbox/Paillave.Etl.Dropbox.csproj @@ -20,12 +20,12 @@ - - + + - net5.0 + net6.0 diff --git a/src/Paillave.Etl.EntityFrameworkCore/Paillave.Etl.EntityFrameworkCore.csproj b/src/Paillave.Etl.EntityFrameworkCore/Paillave.Etl.EntityFrameworkCore.csproj index 3ea58e72..204b7eb0 100644 --- a/src/Paillave.Etl.EntityFrameworkCore/Paillave.Etl.EntityFrameworkCore.csproj +++ b/src/Paillave.Etl.EntityFrameworkCore/Paillave.Etl.EntityFrameworkCore.csproj @@ -20,15 +20,15 @@ - net5.0 + net6.0 latest - - - + + + diff --git a/src/Paillave.Etl.ExcelFile/Paillave.Etl.ExcelFile.csproj b/src/Paillave.Etl.ExcelFile/Paillave.Etl.ExcelFile.csproj index c609702b..501e354d 100644 --- a/src/Paillave.Etl.ExcelFile/Paillave.Etl.ExcelFile.csproj +++ b/src/Paillave.Etl.ExcelFile/Paillave.Etl.ExcelFile.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 diff --git a/src/Paillave.Etl.ExecutionToolkit/Paillave.Etl.ExecutionToolkit.csproj b/src/Paillave.Etl.ExecutionToolkit/Paillave.Etl.ExecutionToolkit.csproj index 4e7e8e46..5e4d020c 100644 --- a/src/Paillave.Etl.ExecutionToolkit/Paillave.Etl.ExecutionToolkit.csproj +++ b/src/Paillave.Etl.ExecutionToolkit/Paillave.Etl.ExecutionToolkit.csproj @@ -18,7 +18,7 @@ - net5.0 + net6.0 latest @@ -47,8 +47,8 @@ - - + + diff --git a/src/Paillave.Etl.FileSystem/Paillave.Etl.FileSystem.csproj b/src/Paillave.Etl.FileSystem/Paillave.Etl.FileSystem.csproj index eb1ebb6e..1f03fd3c 100644 --- a/src/Paillave.Etl.FileSystem/Paillave.Etl.FileSystem.csproj 
+++ b/src/Paillave.Etl.FileSystem/Paillave.Etl.FileSystem.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 diff --git a/src/Paillave.Etl.FromConfigurationConnectors/Paillave.Etl.FromConfigurationConnectors.csproj b/src/Paillave.Etl.FromConfigurationConnectors/Paillave.Etl.FromConfigurationConnectors.csproj index e77e8a16..bee44bf0 100644 --- a/src/Paillave.Etl.FromConfigurationConnectors/Paillave.Etl.FromConfigurationConnectors.csproj +++ b/src/Paillave.Etl.FromConfigurationConnectors/Paillave.Etl.FromConfigurationConnectors.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 @@ -31,7 +31,7 @@ - + diff --git a/src/Paillave.Etl.Ftp/Paillave.Etl.Ftp.csproj b/src/Paillave.Etl.Ftp/Paillave.Etl.Ftp.csproj index 230b4e98..61369cdf 100644 --- a/src/Paillave.Etl.Ftp/Paillave.Etl.Ftp.csproj +++ b/src/Paillave.Etl.Ftp/Paillave.Etl.Ftp.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 @@ -28,8 +28,8 @@ - - + + diff --git a/src/Paillave.Etl.Mail/Paillave.Etl.Mail.csproj b/src/Paillave.Etl.Mail/Paillave.Etl.Mail.csproj index 70b33ef8..df990882 100644 --- a/src/Paillave.Etl.Mail/Paillave.Etl.Mail.csproj +++ b/src/Paillave.Etl.Mail/Paillave.Etl.Mail.csproj @@ -20,9 +20,9 @@ - - - + + + runtime; build; native; contentfiles; analyzers; buildtransitive all @@ -30,7 +30,7 @@ - net5.0 + net6.0 diff --git a/src/Paillave.Etl.Pdf/Paillave.Etl.Pdf.csproj b/src/Paillave.Etl.Pdf/Paillave.Etl.Pdf.csproj new file mode 100644 index 00000000..909b6bb7 --- /dev/null +++ b/src/Paillave.Etl.Pdf/Paillave.Etl.Pdf.csproj @@ -0,0 +1,36 @@ + + + Paillave.EtlNet.Pdf + 2.0.5 + Stéphane Royer + + true + MIT + https://paillave.github.io/Etl.Net/ + ETL .net core SSIS reactive Pdf file + ETL.net PDF files extensions + NugetIcon.png + Pdf files extensions for Etl.Net + +extensions for Etl.Net to deal with PDF files + + + + + + + + net6.0 + latest + + + + + + + + + + + + diff --git a/src/Paillave.Etl.Pdf/PdfFile.Stream.ex.cs b/src/Paillave.Etl.Pdf/PdfFile.Stream.ex.cs new file mode 100644 index 00000000..5e50717a --- /dev/null +++ 
b/src/Paillave.Etl.Pdf/PdfFile.Stream.ex.cs @@ -0,0 +1,11 @@ +using Paillave.Etl.Core; +using System; + +namespace Paillave.Etl.Pdf +{ + public static class PdfFileEx + { + public static IStream CrossApplyPdfContent(this IStream stream, string name, Func argsBuilder, bool noParallelisation = false) + => stream.CrossApply(name, new PdfRowsValuesProvider(argsBuilder(new PdfRowsValuesProviderArgs())), noParallelisation); + } +} diff --git a/src/Paillave.Etl.Pdf/PdfRowsValuesProvider.cs b/src/Paillave.Etl.Pdf/PdfRowsValuesProvider.cs new file mode 100644 index 00000000..3b55ad37 --- /dev/null +++ b/src/Paillave.Etl.Pdf/PdfRowsValuesProvider.cs @@ -0,0 +1,74 @@ +using System; +using Paillave.Etl.Core; +using System.Collections.Generic; +using System.Threading; +// using Paillave.Etl.ValuesProviders; +using Paillave.Pdf; + +namespace Paillave.Etl.Pdf +{ + public class PdfRowsValuesProviderArgs + { + public IList PatternsToIgnore { get; } = new List(); + public IList HeadersSetups { get; } = new List(); + public PdfRowsValuesProviderArgs AddHeadersSetup(HeadersSetup headersSetup) + { + this.HeadersSetups.Add(headersSetup); + return this; + } + public PdfRowsValuesProviderArgs AddIgnore(Func templateBuilder) + { + this.PatternsToIgnore.Add(templateBuilder(new TextTemplate())); + return this; + } + } + public abstract class PdfContent + { + protected PdfContent(List section, int pageNumber, IFileValue fileValue) + => (Section, PageNumber, FileValue) = (section, pageNumber, fileValue); + public List Section { get; } + public int PageNumber { get; } + public IFileValue FileValue { get; } + } + public class PdfHeader : PdfContent + { + public PdfHeader(IFileValue fileValue, List section, int pageNumber) : base(section, pageNumber, fileValue) { } + } + public class PdfTable : PdfContent + { + public List>> Table { get; } + public PdfTable(IFileValue fileValue, List section, int pageNumber, List>> table) : base(section, pageNumber, fileValue) => (Table) = (table); + } + public 
class PdfTextLine : PdfContent + { + public string Text { get; } + public int LineNumber { get; } + public PdfTextLine(IFileValue fileValue, List section, int pageNumber, int lineNumber, string text) : base(section, pageNumber, fileValue) => (Text, LineNumber) = (text, lineNumber); + } + public class PdfRowsValuesProvider : ValuesProviderBase + { + private readonly PdfRowsValuesProviderArgs _args; + public PdfRowsValuesProvider(PdfRowsValuesProviderArgs args) => _args = args; + public override ProcessImpact PerformanceImpact => ProcessImpact.Heavy; + public override ProcessImpact MemoryFootPrint => ProcessImpact.Heavy; + public override void PushValues(IFileValue input, Action push, CancellationToken cancellationToken, IDependencyResolver resolver, IInvoker invoker) + { + var stream = input.GetContent(); + stream.Seek(0, System.IO.SeekOrigin.Begin); + using (var pdfReader = new PdfReader(stream, this._args.PatternsToIgnore, this._args.HeadersSetups)) + pdfReader.Read(new PdfProcessor(push, input)); + } + private class PdfProcessor : IPdfProcessor + { + private readonly Action _push; + private readonly IFileValue _fileValue; + public PdfProcessor(Action push, IFileValue fileValue) => (_push, _fileValue) = (push, fileValue); + public void ProcessLine(string text, int pageNumber, int lineNumber, int lineNumberInParagraph, int lineNumberInPage, List section) + => _push(new PdfTextLine(_fileValue, section, pageNumber, lineNumber, text)); + public void ProcessTable(List>> table, int pageNumber, List section) + => _push(new PdfTable(_fileValue, section, pageNumber, table)); + public void ProcessHeader(List section, int pageNumber) + => _push(new PdfHeader(_fileValue, section, pageNumber)); + } + } +} diff --git a/src/Paillave.Etl.Sftp/Paillave.Etl.Sftp.csproj b/src/Paillave.Etl.Sftp/Paillave.Etl.Sftp.csproj index 89336d99..9ddcac79 100644 --- a/src/Paillave.Etl.Sftp/Paillave.Etl.Sftp.csproj +++ b/src/Paillave.Etl.Sftp/Paillave.Etl.Sftp.csproj @@ -20,12 +20,12 @@ - + - 
net5.0 + net6.0 diff --git a/src/Paillave.Etl.SqlServer/Paillave.Etl.SqlServer.csproj b/src/Paillave.Etl.SqlServer/Paillave.Etl.SqlServer.csproj index cf8b2d02..de87f4cd 100644 --- a/src/Paillave.Etl.SqlServer/Paillave.Etl.SqlServer.csproj +++ b/src/Paillave.Etl.SqlServer/Paillave.Etl.SqlServer.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 @@ -28,7 +28,7 @@ - + diff --git a/src/Paillave.Etl.Tests/Paillave.Etl.Tests.csproj b/src/Paillave.Etl.Tests/Paillave.Etl.Tests.csproj index b95ab2e6..84cc8d39 100644 --- a/src/Paillave.Etl.Tests/Paillave.Etl.Tests.csproj +++ b/src/Paillave.Etl.Tests/Paillave.Etl.Tests.csproj @@ -1,17 +1,17 @@ - net5.0 + net6.0 latest false - + - + diff --git a/src/Paillave.Etl.TextFile/Paillave.Etl.TextFile.csproj b/src/Paillave.Etl.TextFile/Paillave.Etl.TextFile.csproj index d7d5751c..0ac20b96 100644 --- a/src/Paillave.Etl.TextFile/Paillave.Etl.TextFile.csproj +++ b/src/Paillave.Etl.TextFile/Paillave.Etl.TextFile.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 diff --git a/src/Paillave.Etl.XmlFile/Paillave.Etl.XmlFile.csproj b/src/Paillave.Etl.XmlFile/Paillave.Etl.XmlFile.csproj index fe249c46..3cd024a1 100644 --- a/src/Paillave.Etl.XmlFile/Paillave.Etl.XmlFile.csproj +++ b/src/Paillave.Etl.XmlFile/Paillave.Etl.XmlFile.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 diff --git a/src/Paillave.Etl.Zip/Paillave.Etl.Zip.csproj b/src/Paillave.Etl.Zip/Paillave.Etl.Zip.csproj index 4bb77df1..dc903009 100644 --- a/src/Paillave.Etl.Zip/Paillave.Etl.Zip.csproj +++ b/src/Paillave.Etl.Zip/Paillave.Etl.Zip.csproj @@ -20,7 +20,7 @@ - net5.0 + net6.0 @@ -28,8 +28,8 @@ - - + + diff --git a/src/Paillave.Etl.sln b/src/Paillave.Etl.sln index 43e27dc5..d78819a0 100644 --- a/src/Paillave.Etl.sln +++ b/src/Paillave.Etl.sln @@ -1,5 +1,4 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 +Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.26124.0 MinimumVisualStudioVersion = 15.0.26124.0 @@ -41,6 +40,12 @@ 
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Paillave.Etl.SimpleTutorial EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Paillave.Etl.BlogTutorial", "Tutorials\BlogTutorial\BlogTutorial.csproj", "{191DFF01-0BA8-4345-AC4E-C94E8245F007}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Paillave.Etl.Bloomberg", "Paillave.Etl.Bloomberg\Paillave.Etl.Bloomberg.csproj", "{4DBE1730-6E96-43FF-A36E-FBBB16617775}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Paillave.Pdf", "Paillave.Pdf\Paillave.Pdf.csproj", "{8FD75A3B-6726-4F19-A82E-D67DE6539901}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Paillave.Etl.Pdf", "Paillave.Etl.Pdf\Paillave.Etl.Pdf.csproj", "{471FE8AD-1A89-41E4-B39D-773A82FBAA0A}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -279,6 +284,42 @@ Global {191DFF01-0BA8-4345-AC4E-C94E8245F007}.Release|x64.Build.0 = Release|Any CPU {191DFF01-0BA8-4345-AC4E-C94E8245F007}.Release|x86.ActiveCfg = Release|Any CPU {191DFF01-0BA8-4345-AC4E-C94E8245F007}.Release|x86.Build.0 = Release|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Debug|x64.ActiveCfg = Debug|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Debug|x64.Build.0 = Debug|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Debug|x86.ActiveCfg = Debug|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Debug|x86.Build.0 = Debug|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Release|Any CPU.Build.0 = Release|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Release|x64.ActiveCfg = Release|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Release|x64.Build.0 = Release|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Release|x86.ActiveCfg 
= Release|Any CPU + {4DBE1730-6E96-43FF-A36E-FBBB16617775}.Release|x86.Build.0 = Release|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Debug|Any CPU.Build.0 = Debug|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Debug|x64.ActiveCfg = Debug|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Debug|x64.Build.0 = Debug|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Debug|x86.ActiveCfg = Debug|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Debug|x86.Build.0 = Debug|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Release|Any CPU.ActiveCfg = Release|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Release|Any CPU.Build.0 = Release|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Release|x64.ActiveCfg = Release|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Release|x64.Build.0 = Release|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Release|x86.ActiveCfg = Release|Any CPU + {8FD75A3B-6726-4F19-A82E-D67DE6539901}.Release|x86.Build.0 = Release|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Debug|x64.ActiveCfg = Debug|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Debug|x64.Build.0 = Debug|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Debug|x86.ActiveCfg = Debug|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Debug|x86.Build.0 = Debug|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Release|Any CPU.Build.0 = Release|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Release|x64.ActiveCfg = Release|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Release|x64.Build.0 = Release|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Release|x86.ActiveCfg = Release|Any CPU + {471FE8AD-1A89-41E4-B39D-773A82FBAA0A}.Release|x86.Build.0 = Release|Any 
CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/Paillave.Etl/Extensions/Do.Stream.ex.cs b/src/Paillave.Etl/Extensions/Do.Stream.ex.cs index 638c4a6f..6592c816 100644 --- a/src/Paillave.Etl/Extensions/Do.Stream.ex.cs +++ b/src/Paillave.Etl/Extensions/Do.Stream.ex.cs @@ -9,7 +9,7 @@ public static IStream Do(this IStream stream, string name, Action { return new DoStreamNode>(name, new DoArgs> { - Processor = new SimpleDoProcessor(processRow), + Processor = new SimpleDoProcessor(i => i, processRow), Stream = stream }).Output; } @@ -17,7 +17,7 @@ public static ISortedStream Do(this ISortedStream>(name, new DoArgs> { - Processor = new SimpleDoProcessor(processRow), + Processor = new SimpleDoProcessor(i => i, processRow), Stream = stream }).Output; } @@ -25,7 +25,7 @@ public static IKeyedStream Do(this IKeyedStream { return new DoStreamNode>(name, new DoArgs> { - Processor = new SimpleDoProcessor(processRow), + Processor = new SimpleDoProcessor(i => i, processRow), Stream = stream }).Output; } @@ -33,152 +33,139 @@ public static ISingleStream Do(this ISingleStream stream, string { return new DoStreamNode>(name, new DoArgs> { - Processor = new SimpleDoProcessor(processRow), + Processor = new SimpleDoProcessor(i => i, processRow), Stream = stream }).Output; } - #endregion - - #region Simple action processor - public static IStream Do(this IStream stream, string name, IDoProcessor processor) + public static IStream> Do(this IStream> stream, string name, Action processRow) { - return new DoStreamNode>(name, new DoArgs> + return new DoStreamNode, IStream>>(name, new DoArgs, IStream>> { - Processor = processor, + Processor = new SimpleDoProcessor, TIn>(i => i.Row, processRow), Stream = stream }).Output; } - public static ISortedStream Do(this ISortedStream stream, string name, IDoProcessor processor) + public static ISortedStream, TKey> Do(this ISortedStream, TKey> stream, string name, Action processRow) { - return 
new DoStreamNode>(name, new DoArgs> + return new DoStreamNode, ISortedStream, TKey>>(name, new DoArgs, ISortedStream, TKey>> { - Processor = processor, + Processor = new SimpleDoProcessor, TIn>(i => i.Row, processRow), Stream = stream }).Output; } - public static IKeyedStream Do(this IKeyedStream stream, string name, IDoProcessor processor) + public static IKeyedStream, TKey> Do(this IKeyedStream, TKey> stream, string name, Action processRow) { - return new DoStreamNode>(name, new DoArgs> + return new DoStreamNode, IKeyedStream, TKey>>(name, new DoArgs, IKeyedStream, TKey>> { - Processor = processor, + Processor = new SimpleDoProcessor, TIn>(i => i.Row, processRow), Stream = stream }).Output; } - #endregion - - #region Process and preprocess row - public static IStream Do(this IStream stream, string name, ISingleStream resourceStream, Action processRow, Action preProcess = null) + public static ISingleStream> Do(this ISingleStream> stream, string name, Action processRow) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode, ISingleStream>>(name, new DoArgs, ISingleStream>> { - Stream = stream, - ResourceStream = resourceStream, - Processor = new SimpleDoProcessor(processRow, preProcess) + Processor = new SimpleDoProcessor, TIn>(i => i.Row, processRow), + Stream = stream }).Output; } - public static ISortedStream Do(this ISortedStream stream, string name, ISingleStream resourceStream, Action processRow, Action preProcess = null) + #endregion + + + + #region Process row with injection + public static IStream Do(this IStream stream, string name, Func, IDoProcessor> o) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = new SimpleDoProcessor(processRow, preProcess) + Processor = o(new DoWithResolutionProcessorBuilder(i => i)), + Stream = stream }).Output; } - public static IKeyedStream Do(this 
IKeyedStream stream, string name, ISingleStream resourceStream, Action processRow, Action preProcess = null) + public static ISortedStream Do(this ISortedStream stream, string name, Func, IDoProcessor> o) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = new SimpleDoProcessor(processRow, preProcess) + Processor = o(new DoWithResolutionProcessorBuilder(i => i)), + Stream = stream }).Output; } - #endregion - - #region Process and preprocess processor - public static IStream Do(this IStream stream, string name, ISingleStream resourceStream, IDoProcessor processor) + public static IKeyedStream Do(this IKeyedStream stream, string name, Func, IDoProcessor> o) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = processor + Processor = o(new DoWithResolutionProcessorBuilder(i => i)), + Stream = stream }).Output; } - public static ISortedStream Do(this ISortedStream stream, string name, ISingleStream resourceStream, IDoProcessor processor) + public static ISingleStream Do(this ISingleStream stream, string name, Func, IDoProcessor> o) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = processor + Processor = o(new DoWithResolutionProcessorBuilder(i => i)), + Stream = stream }).Output; } - public static IKeyedStream Do(this IKeyedStream stream, string name, ISingleStream resourceStream, IDoProcessor processor) + public static IStream> Do(this IStream> stream, string name, Func, TIn>, IDoProcessor>> o) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode, IStream>>(name, new DoArgs, IStream>> { - Stream = stream, - ResourceStream = 
resourceStream, - Processor = processor + Processor = o(new DoWithResolutionProcessorBuilder, TIn>(i => i.Row)), + Stream = stream }).Output; } - #endregion - - #region Simple process row with context - public static IStream Do(this IStream stream, string name, TCtx initialContext, Action> processRow) + public static ISortedStream, TKey> Do(this ISortedStream, TKey> stream, string name, Func, TIn>, IDoProcessor>> o) { - return new DoStreamNode>(name, new DoArgs> + return new DoStreamNode, ISortedStream, TKey>>(name, new DoArgs, ISortedStream, TKey>> { - Processor = new ContextDoProcessor(processRow, initialContext), + Processor = o(new DoWithResolutionProcessorBuilder, TIn>(i => i.Row)), Stream = stream }).Output; } - public static ISortedStream Do(this ISortedStream stream, string name, TCtx initialContext, Action> processRow) + public static IKeyedStream, TKey> Do(this IKeyedStream, TKey> stream, string name, Func, TIn>, IDoProcessor>> o) { - return new DoStreamNode>(name, new DoArgs> + return new DoStreamNode, IKeyedStream, TKey>>(name, new DoArgs, IKeyedStream, TKey>> { - Processor = new ContextDoProcessor(processRow, initialContext), + Processor = o(new DoWithResolutionProcessorBuilder, TIn>(i => i.Row)), Stream = stream }).Output; } - public static IKeyedStream Do(this IKeyedStream stream, string name, TCtx initialContext, Action> processRow) + public static ISingleStream> Do(this ISingleStream> stream, string name, Func, TIn>, IDoProcessor>> o) { - return new DoStreamNode>(name, new DoArgs> + return new DoStreamNode, ISingleStream>>(name, new DoArgs, ISingleStream>> { - Processor = new ContextDoProcessor(processRow, initialContext), + Processor = o(new DoWithResolutionProcessorBuilder, TIn>(i => i.Row)), Stream = stream }).Output; } #endregion - #region Process and preprocess row with context - public static IStream Do(this IStream stream, string name, ISingleStream resourceStream, Action> processRow, Action> preProcess = null) + + + + #region Simple action 
processor + public static IStream Do(this IStream stream, string name, IDoProcessor processor) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = new ContextDoProcessor(processRow, preProcess) + Processor = processor, + Stream = stream }).Output; } - public static ISortedStream Do(this ISortedStream stream, string name, ISingleStream resourceStream, Action> processRow, Action> preProcess = null) + public static ISortedStream Do(this ISortedStream stream, string name, IDoProcessor processor) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = new ContextDoProcessor(processRow, preProcess) + Processor = processor, + Stream = stream }).Output; } - public static IKeyedStream Do(this IKeyedStream stream, string name, ISingleStream resourceStream, Action> processRow, Action> preProcess = null) + public static IKeyedStream Do(this IKeyedStream stream, string name, IDoProcessor processor) { - return new DoStreamNode, TResource>(name, new DoArgs, TResource> + return new DoStreamNode>(name, new DoArgs> { - Stream = stream, - ResourceStream = resourceStream, - Processor = new ContextDoProcessor(processRow, preProcess) + Processor = processor, + Stream = stream }).Output; } #endregion diff --git a/src/Paillave.Etl/Extensions/StreamNodes/DoStreamNode.cs b/src/Paillave.Etl/Extensions/StreamNodes/DoStreamNode.cs index 0beb9dc0..ebf6e1e8 100644 --- a/src/Paillave.Etl/Extensions/StreamNodes/DoStreamNode.cs +++ b/src/Paillave.Etl/Extensions/StreamNodes/DoStreamNode.cs @@ -1,4 +1,5 @@ using System; +using System.Threading; using Paillave.Etl.Reactive.Operators; namespace Paillave.Etl.Core @@ -6,36 +7,17 @@ namespace Paillave.Etl.Core #region Actions Without Resources public interface IDoProcessor { - void ProcessRow(TIn 
value); + void ProcessRow(TIn value, CancellationToken cancellationToken, IDependencyResolver resolver, IInvoker invoker); } - public class SimpleDoProcessor : IDoProcessor + public class SimpleDoProcessor : IDoProcessor { - private Action _processRow; - public SimpleDoProcessor(Action processRow) + private readonly Func _getInner; + private readonly Action _processRow; + public SimpleDoProcessor(Func getInner, Action processRow) + => (_processRow, _getInner) = (processRow, getInner); + public void ProcessRow(TIn value, CancellationToken cancellationToken, IDependencyResolver resolver, IInvoker invoker) { - _processRow = processRow; - } - public void ProcessRow(TIn value) - { - _processRow(value); - } - } - public class ContextDoProcessor : IDoProcessor - { - private Action> _processRow; - private TCtx _context; - public ContextDoProcessor(Action> processRow, TCtx initialContext) - { - _processRow = processRow; - _context = initialContext; - } - public void ProcessRow(TIn value) - { - _processRow(value, _context, SetContext); - } - private void SetContext(TCtx newContext) - { - _context = newContext; + _processRow(_getInner(value)); } } public class DoArgs where TStream : IStream @@ -50,87 +32,50 @@ public DoStreamNode(string name, DoArgs args) : base(name, args) { public override ProcessImpact MemoryFootPrint => ProcessImpact.Light; protected override TStream CreateOutputStream(DoArgs args) { - return base.CreateMatchingStream(args.Stream.Observable.Do(args.Processor.ProcessRow), args.Stream); + return base.CreateMatchingStream(args.Stream.Observable.Do(i => args.Processor.ProcessRow(i, CancellationToken.None, args.Stream.SourceNode.ExecutionContext.DependencyResolver, args.Stream.SourceNode.ExecutionContext)), args.Stream); } } #endregion - - #region Actions With Resources - public interface IDoProcessor - { - void PreProcess(TResource resource); - void ProcessRow(TIn value, TResource resource); - } - public class SimpleDoProcessor : IDoProcessor + public class 
DoWithResolutionProcessor : IDoProcessor where TService : class { - private Action _processRow; - private Action _preProcess; - public SimpleDoProcessor(Action processRow, Action preProcess = null) - { - _preProcess = preProcess; - _processRow = processRow; - } - public void PreProcess(TResource resource) - { - _preProcess?.Invoke(resource); - } + private readonly Action _actionFull = null; + private readonly Action _actionSimple = null; + private readonly Func _getInner; + private TService _service = null; + private readonly object _lock = new Object(); + public DoWithResolutionProcessor(Action actionFull, Func getInner) => (_actionFull, _getInner) = (actionFull, getInner); + public DoWithResolutionProcessor(Action actionSimple, Func getInner) => (_actionSimple, _getInner) = (actionSimple, getInner); - public void ProcessRow(TIn value, TResource resource) - { - _processRow(value, resource); - } - } - public class ContextDoProcessor : IDoProcessor - { - private Action> _processRow; - private Action> _preProcess; - private TCtx _context; - public ContextDoProcessor(Action> processRow, Action> preProcess = null) - { - _preProcess = preProcess; - _processRow = processRow; - _context = default(TCtx); - } - public void ProcessRow(TIn value, TResource resource) - { - _processRow(value, resource, _context, SetContext); - } - public void PreProcess(TResource resource) - { - _preProcess?.Invoke(resource, SetContext); - } - private void SetContext(TCtx newContext) + public void ProcessRow(TIn value, CancellationToken cancellationToken, IDependencyResolver resolver, IInvoker invoker) { - _context = newContext; + lock (_lock) + { + if (_service == null) + { + _service = resolver.Resolve(); + } + } + if (_actionFull != null) + { + _actionFull(_getInner(value), _service, cancellationToken, invoker); + } + else if (_actionSimple != null) + { + _actionSimple(_getInner(value), _service); + } } } - public class DoArgs where TStream : IStream + public class 
DoWithResolutionProcessorBuilder { - public TStream Stream { get; set; } - public ISingleStream ResourceStream { get; set; } - public IDoProcessor Processor { get; set; } + private readonly Func _getInner; + public DoWithResolutionProcessorBuilder(Func getInner) => _getInner = getInner; + public DoWithResolutionProcessorBuilder Resolve() where TService : class => new DoWithResolutionProcessorBuilder(_getInner); } - public class DoStreamNode : StreamNodeBase> where TStream : IStream + public class DoWithResolutionProcessorBuilder where TService : class { - public DoStreamNode(string name, DoArgs args) : base(name, args) - { - } - - public override ProcessImpact PerformanceImpact => ProcessImpact.Light; - - public override ProcessImpact MemoryFootPrint => ProcessImpact.Light; - - protected override TStream CreateOutputStream(DoArgs args) - { - var firstStreamWriter = args.ResourceStream.Observable; - //if (args.PreProcess != null) - firstStreamWriter = firstStreamWriter - .Do(i => args.Processor.PreProcess(i)) - .DelayTillEndOfStream(); - var obs = args.Stream.Observable - .CombineWithLatest(firstStreamWriter, (i, r) => { args.Processor.ProcessRow(i, r); return i; }, true); - return CreateMatchingStream(obs, args.Stream); - } + private readonly Func _getInner; + public DoWithResolutionProcessorBuilder(Func getInner) => _getInner = getInner; + public void ThenDo(Action actionFull) => new DoWithResolutionProcessor(actionFull, _getInner); + public void ThenDo(Action actionSimple) => new DoWithResolutionProcessor(actionSimple, _getInner); } - #endregion } diff --git a/src/Paillave.Etl/Paillave.Etl.csproj b/src/Paillave.Etl/Paillave.Etl.csproj index 34dfa70b..999b59e4 100644 --- a/src/Paillave.Etl/Paillave.Etl.csproj +++ b/src/Paillave.Etl/Paillave.Etl.csproj @@ -18,7 +18,7 @@ - net5.0 + net6.0 latest diff --git a/src/Paillave.Pdf/ApproximateEqualityComparer.cs b/src/Paillave.Pdf/ApproximateEqualityComparer.cs new file mode 100644 index 00000000..99d65eb4 --- /dev/null 
+++ b/src/Paillave.Pdf/ApproximateEqualityComparer.cs @@ -0,0 +1,12 @@ +using System.Collections.Generic; + +namespace Paillave.Pdf +{ + public class ApproximateEqualityComparer : IEqualityComparer + { + private readonly double _proximity = 0; + public ApproximateEqualityComparer(double proximity = 0) => _proximity = proximity; + public bool Equals(double x1, double x2) => x1 - _proximity <= x2 && x1 + _proximity >= x2; + public int GetHashCode(double obj) => 0; + } +} diff --git a/src/Paillave.Pdf/EnumerationEx.cs b/src/Paillave.Pdf/EnumerationEx.cs new file mode 100644 index 00000000..7529ca72 --- /dev/null +++ b/src/Paillave.Pdf/EnumerationEx.cs @@ -0,0 +1,45 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Paillave.Pdf +{ + public static class EnumerationEx + { + public static IEnumerable<(T, T)> Pair(this IEnumerable range) + { + bool hasFirstElement = false; + T previousElement = default; + foreach (var item in range) + { + if (hasFirstElement) yield return (previousElement, item); + else hasFirstElement = true; + previousElement = item; + } + } + public static (int, int) GetPosition(this IList range, T elt) where T : IComparable + => GetPosition(range, elt, new LambdaComparer((x, y) => x.CompareTo(y))); + public static (int, int) GetPosition(this IList range, T elt, Func compare) + => GetPosition(range, elt, new LambdaComparer((x, y) => compare(x, y))); + public static (int, int) GetPosition(this IList range, T elt, IComparer comparer) + { + int leftBound = 0; + int rightBound = range.Count - 1; + while (leftBound < rightBound - 1) + { + int middleBound = (leftBound + rightBound) / 2; + if (comparer.Compare(range[middleBound], elt) > 0) rightBound = middleBound; + else leftBound = middleBound; + } + return (leftBound, rightBound); + } + public static bool Contains(this IEnumerable range, double value, double proximity) + => proximity == 0 ? 
range.Contains(value) : range.Any(i => value - proximity <= i && value + proximity >= i); + private class LambdaComparer : IComparer + { + private readonly Func _compare; + public LambdaComparer(Func compare) => (_compare) = (compare); + public int Compare(T x, T y) => _compare(x, y); + } + } +} \ No newline at end of file diff --git a/src/Paillave.Pdf/Grid.cs b/src/Paillave.Pdf/Grid.cs new file mode 100644 index 00000000..13e6da24 --- /dev/null +++ b/src/Paillave.Pdf/Grid.cs @@ -0,0 +1,162 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; + +namespace Paillave.Pdf +{ + public class Grid : IBounds + { + private readonly double _proximity; + public Grid(double proximity = 0) => _proximity = proximity; + public double Top { get; private set; } + public double Bottom { get; private set; } + public double Left { get; private set; } + public double Right { get; private set; } + public List LinesX { get; private set; } = new List(); + public List LinesY { get; private set; } = new List(); + private List _relatedLines = new List(); + private List _simplifiedRelatedLines = new List(); + private List _cellLines = new List(); + private List[][] _content; + internal void Build() + { + BuildLines(); + if (LinesY.Count > 1 && LinesX.Count > 1) + _content = Enumerable.Range(0, LinesY.Count - 1).Select(i => Enumerable.Range(0, LinesX.Count - 1).Select(j => new List()).ToArray()).ToArray(); + else + _content = new[] { new[] { new List() } }; + } + private void BuildLines() + { + _simplifiedRelatedLines = SimplifyLines(_relatedLines); + if (_simplifiedRelatedLines.Count > 0) + { + this.Bottom = _simplifiedRelatedLines.Min(l => l.Bottom); + this.Top = _simplifiedRelatedLines.Max(l => l.Top); + this.Left = _simplifiedRelatedLines.Min(l => l.Left); + this.Right = _simplifiedRelatedLines.Max(l => l.Right); + } + _cellLines = CleanUpInnerLines(_simplifiedRelatedLines); + this.LinesX = _cellLines.Where(l => l.IsVertical).Select(l => 
l.Left).Distinct().OrderBy(i => i).ToList(); + this.LinesY = _cellLines.Where(l => l.IsHorizontal).Select(l => l.Top).Distinct().OrderBy(i => i).ToList(); + } + private List CleanUpInnerLines(List lines) + { + var top = this.Top - _proximity; + var bottom = this.Bottom + _proximity; + var right = this.Right - _proximity; + var left = this.Left + _proximity; + var linesX = lines.Where(l => l.IsVertical && l.Top >= top && l.Bottom <= bottom).Select(l => l.Left).Distinct().OrderBy(i => i).ToList(); + var linesY = lines.Where(l => l.IsHorizontal && l.Left <= left && l.Right >= right).Select(l => l.Top).Distinct().OrderBy(i => i).ToList(); + var linesToRemove = linesX.Pair().SelectMany(x => linesY.Pair().Select(y => new CellBounds(y.Item2, y.Item1, x.Item1, x.Item2))).SelectMany(c => lines.Where(l => Contains(c, l))).ToHashSet(); + + return lines.Where(l => !linesToRemove.Contains(l)).ToList(); + } + private class CellBounds : IBounds + { + public CellBounds(double top, double bottom, double left, double right) + => (Top, Bottom, Left, Right) = (top, bottom, left, right); + public double Top { get; } + public double Bottom { get; } + public double Left { get; } + public double Right { get; } + } + private bool Contains(IBounds outer, IBounds inner) + => outer.Left - _proximity < inner.Left && outer.Right + _proximity > inner.Right + && outer.Bottom - _proximity < inner.Bottom && outer.Top + _proximity > inner.Top; + private List SimplifyLines(List lines) + { + var comparer = new ApproximateEqualityComparer(_proximity); + var horizontalLines = lines + .Where(line => line.IsHorizontal && !line.IsVertical) + .GroupBy(line => line.Top, comparer) + .SelectMany(lines => SimplifySegments(lines.OrderBy(line => line.Left).Select(line => (line.Left, line.Right))).Select(line => new GridLine( + new UglyToad.PdfPig.Core.PdfPoint(line.Item1, lines.Key), + new UglyToad.PdfPig.Core.PdfPoint(line.Item2, lines.Key), + _proximity, + null))).ToList(); + var verticalLines = lines + 
.Where(line => line.IsVertical && !line.IsHorizontal) + .GroupBy(line => line.Left, comparer) + .SelectMany(lines => SimplifySegments(lines.OrderBy(line => line.Bottom).Select(line => (line.Bottom, line.Top))).Select(line => new GridLine( + new UglyToad.PdfPig.Core.PdfPoint(lines.Key, line.Item1), + new UglyToad.PdfPig.Core.PdfPoint(lines.Key, line.Item2), + _proximity, + null))).ToList(); + return horizontalLines.Union(verticalLines).ToList(); + } + private IEnumerable<(double, double)> SimplifySegments(IEnumerable<(double, double)> segments) + { + double? currentEnd = null; + double? currentStart = null; + foreach (var (start, end) in segments) + { + if (currentStart == null) currentStart = start; + if (currentEnd != null && start > currentEnd.Value + _proximity) + { + if (currentStart.Value <= currentEnd.Value + _proximity) + yield return (currentStart.Value, currentEnd.Value); + currentStart = start; + } + if (currentEnd == null) currentEnd = end; + else currentEnd = Math.Max(end, currentEnd.Value); + } + if (currentStart.Value <= currentEnd.Value + _proximity) + yield return (currentStart.Value, currentEnd.Value); + } + private static IEnumerable ReflowText(IEnumerable words) => new LinesOfWords(words).GetLines(); + public bool TryAddWord(Word word) + { + if (this.TryGetRelatedCell(word, out var loc)) + { + var rowNumber = this.LinesY.Count - 2 - loc.row; + _content[rowNumber][loc.column].Add(word); + return true; + } + return false; + } + public List>> GetContent() + => _content.Select(row => row.Select(cell => ReflowText(cell).ToList()).ToList()).ToList(); + public void AddLine(GridLine gridLine) => _relatedLines.Add(gridLine); + public bool TryGetRelatedCell(Word textLine, out (int row, int column) cell) + { + var centerX = textLine.BoundingBox.Centroid.X; + var centerY = textLine.BoundingBox.Centroid.Y; + // var centerX = (textLine.BoundingBox.Left + textLine.BoundingBox.Right) / 2; + // var centerY = (textLine.BoundingBox.Top + 
textLine.BoundingBox.Bottom) / 2; + cell = default; + if (centerX < this.Left || centerX > this.Right) return false; + if (centerY < this.Bottom || centerY > this.Top) return false; + cell = ( + this.LinesY.GetPosition(centerY).Item1, + this.LinesX.GetPosition(centerX).Item1 + ); + return true; + } + public void DrawGrid(SvgBuilder builder) + { + foreach (var x in LinesX) builder.VerticalLine(x, this.Top, this.Bottom); + foreach (var y in LinesY) builder.HorizontalLine(this.Left, y, this.Right); + builder.Rectangle(this.Left, this.Top, this.Right, this.Bottom, "orange"); + } + public void DrawCellLines(SvgBuilder builder) + { + foreach (var x in _cellLines) + builder.Line(x.Left, x.Top, x.Right, x.Bottom); + builder.Rectangle(this.Left, this.Top, this.Right, this.Bottom, "orange"); + } + public void DrawRelatedLines(SvgBuilder builder) + { + foreach (var x in _relatedLines) + builder.Line(x.Left, x.Top, x.Right, x.Bottom); + builder.Rectangle(this.Left, this.Top, this.Right, this.Bottom, "orange"); + } + public void DrawSimplifiedRelatedLines(SvgBuilder builder) + { + foreach (var x in _simplifiedRelatedLines) + builder.Line(x.Left, x.Top, x.Right, x.Bottom); + builder.Rectangle(this.Left, this.Top, this.Right, this.Bottom, "orange"); + } + } +} diff --git a/src/Paillave.Pdf/GridExtractor.cs b/src/Paillave.Pdf/GridExtractor.cs new file mode 100644 index 00000000..b1539f95 --- /dev/null +++ b/src/Paillave.Pdf/GridExtractor.cs @@ -0,0 +1,143 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Core; +using static UglyToad.PdfPig.Core.PdfSubpath; + +namespace Paillave.Pdf +{ + public class GridExtractor + { + private readonly double _proximity; + public GridExtractor(double proximity = 3) => _proximity = proximity; + private IEnumerable ExtractLines(Page page) + { + foreach (var path in page.ExperimentalAccess.Paths) + { + var color = path.FillColor ?? 
path.StrokeColor; + if (!path.IsFilled && !path.IsStroked) continue; + if (color == null || color.ToRGBValues() == (1, 1, 1)) continue; + foreach (var subpath in path) + { + if (!(subpath.Commands[0] is Move first)) continue; + if (subpath.Commands.Any(c => c is BezierCurve)) continue; + + PdfPoint? start_pos = first.Location; + PdfPoint? last_move = start_pos; + PdfPoint? end_pos = null; + + foreach (var command in subpath.Commands) + { + switch (command) + { + case Line linePath: + end_pos = linePath.To; + if (start_pos == null) break; + yield return new GridLine(start_pos.Value, linePath.To, _proximity, color); + break; + case Move move: + start_pos = move.Location; + end_pos = start_pos; + break; + case Close: + if (start_pos == null || end_pos == null) break; + yield return new GridLine(last_move.Value, end_pos.Value, _proximity, color); + break; + } + start_pos = end_pos; + } + } + } + } + public GridExtraction Extract(Page page, bool debug = false) // builds the grids drawn on the page and collects the horizontal rulings whose Y lies outside every kept grid + { + var rulings = ExtractLines(page).ToList(); // materialized once: ExtractLines is a lazy iterator and rulings is enumerated twice below + var grids = GetGrids(rulings).ToList(); + foreach (var grid in grids) grid.Build(); + + grids = ExcludeInnerBounds(grids).Where(i => i.LinesX.Count > 2 && i.LinesY.Count > 2).ToList(); + + var outOfScopeHorizontalLines = rulings + .Where(r => r.IsHorizontal && !grids.Any(g => g.Top + _proximity >= r.Top && g.Bottom - _proximity <= r.Top)) + .Select(l => new HorizontalLine(l.Top, l.Left, l.Right)).ToList(); + +#if DEBUG + if (debug && grids.Any()) + { + var svg = new SvgBuilder((int)page.Width, (int)page.Height, page.Number, "RelatedLines", true); + foreach (var item in grids) item.DrawRelatedLines(svg); + svg.Show(); + // var svg2 = new SvgBuilder((int)page.Width, (int)page.Height, page.Number, true); + // foreach (var item in grids) item.DrawSimplifiedRelatedLines(svg2); + // svg2.Show("SimplifiedRelatedLines"); + // var svg3 = new SvgBuilder((int)page.Width, (int)page.Height, page.Number, true); + // foreach (var item in grids) item.DrawCellLines(svg3); + // 
svg3.Show("CellLines"); + var svg4 = new SvgBuilder((int)page.Width, (int)page.Height, page.Number, "Grid", true); + foreach (var item in grids) item.DrawGrid(svg4); + svg4.Show(); + } +#endif + return new GridExtraction + { + Grids = grids, + OutOfScopeHorizontalLines = outOfScopeHorizontalLines + }; + } + private IEnumerable GetGrids(IEnumerable inputLines) // yields one Grid per group of transitively intersecting horizontal or vertical lines + { + var grids = new List(); + var lines = inputLines.Where(i => i.IsHorizontal || i.IsVertical).ToList(); + var processedLines = new HashSet(); + foreach (var line in lines) + if (!processedLines.Contains(line)) + yield return GetGrids(line, lines, processedLines); + } + private Grid GetGrids(GridLine line, List lines, HashSet processedLines, Grid grid = null) // recursive step: adds line, then every not yet processed line intersecting it, to the same grid + { + if (grid == null) grid = new Grid(_proximity); + grid.AddLine(line); + processedLines.Add(line); + var intersects = lines.Where(l => !processedLines.Contains(l) && Intersects(l, line)).ToList(); + foreach (var intersect in intersects) GetGrids(intersect, lines, processedLines, grid); + return grid; + } + private bool Intersects(GridLine line1, GridLine line2) // true when exactly one of the two lines is horizontal and they cross within the _proximity tolerance + { + if (line1.IsHorizontal == line2.IsHorizontal) return false; + if (line1.IsHorizontal) return line1.Top <= (line2.Top + _proximity) && line1.Top >= (line2.Bottom - _proximity) + && line2.Left >= (line1.Left - _proximity) && line2.Left <= (line1.Right + _proximity); + return line2.Top <= (line1.Top + _proximity) && line2.Top >= (line1.Bottom - _proximity) + && line1.Left >= (line2.Left - _proximity) && line1.Left <= (line2.Right + _proximity); + } + private List ExcludeInnerBounds(List bounds) where B : IBounds // keeps only the bounds that are not contained in another bound of the list + { + var outerBounds = new List(); + foreach (var bound in bounds) + { + if (outerBounds.FindIndex(i => Contains(i, bound)) < 0) + { + outerBounds.RemoveAll(i => Contains(bound, i)); + outerBounds.Add(bound); + } + } + return outerBounds; + } + private bool Contains(IBounds outerBound, IBounds innerBound) + => outerBound.Left <= innerBound.Left && outerBound.Right >= 
innerBound.Right && outerBound.Top >= innerBound.Top && outerBound.Bottom <= innerBound.Bottom; + } + public class GridExtraction + { + public List Grids { get; set; } + // useful to detect if a text is underlined + public List OutOfScopeHorizontalLines { get; set; } + } + public class HorizontalLine + { + public HorizontalLine(double y, double left, double right) => (Y, Left, Right) = (y, left, right); + public double Y { get; } + public double Left { get; } + public double Right { get; } + } +} diff --git a/src/Paillave.Pdf/GridLine.cs b/src/Paillave.Pdf/GridLine.cs new file mode 100644 index 00000000..725f0d9e --- /dev/null +++ b/src/Paillave.Pdf/GridLine.cs @@ -0,0 +1,51 @@ +using System; +using UglyToad.PdfPig.Core; +using UglyToad.PdfPig.DocumentLayoutAnalysis; +using UglyToad.PdfPig.Graphics.Colors; + +namespace Paillave.Pdf +{ + public class GridLine : IBounds // segment from P1 to P2 where P2 is snapped onto P1's X and/or Y when they differ by at most the given proximity + { + public PdfPoint P1 { get; } + public PdfPoint P2 { get; } + public double Left { get; } + public double Right { get; } + public double Top { get; } + public double Bottom { get; } + public bool IsVertical { get; } + public bool IsHorizontal { get; } + public IColor Color { get; } + public GridLine(PdfPoint p1, PdfPoint p2, double proximity, IColor color) // proximity: tolerance under which a coordinate of p2 is replaced by the matching coordinate of p1; color may be null + { + this.Color = color; + this.P1 = p1; + var x2 = p1.X <= p2.X + proximity && p1.X >= p2.X - proximity ? p1.X : p2.X; + var y2 = p1.Y <= p2.Y + proximity && p1.Y >= p2.Y - proximity ? 
p1.Y : p2.Y; + this.P2 = new PdfPoint(x2, y2); // fixed: the Y coordinate was built from x2 instead of y2 + if (P1.Y > y2) + { + this.Top = P1.Y; + this.Bottom = y2; + } + else + { + this.Top = y2; + this.Bottom = P1.Y; + } + if (P1.X > x2) + { + this.Right = P1.X; + this.Left = x2; + } + else + { + this.Right = x2; + this.Left = P1.X; + } + this.IsHorizontal = this.Top == this.Bottom; // exact comparison is enough: y2 already equals P1.Y whenever both ends are within proximity + this.IsVertical = this.Left == this.Right; + } + public override string ToString() => $"{this.GetType()}[x1={this.P1.X:0},y1={this.P1.Y:0},x2={this.P2.X:0},y2={this.P2.Y:0},rgb=({this.Color?.ToRGBValues()})]"; // Color is null for lines created by Grid.SimplifyLines, so it is read null-conditionally + } +} diff --git a/src/Paillave.Pdf/IBounds.cs b/src/Paillave.Pdf/IBounds.cs new file mode 100644 index 00000000..3ed45bb4 --- /dev/null +++ b/src/Paillave.Pdf/IBounds.cs @@ -0,0 +1,10 @@ +namespace Paillave.Pdf +{ + public interface IBounds + { + double Top { get; } + double Bottom { get; } + double Left { get; } + double Right { get; } + } +} diff --git a/src/Paillave.Pdf/IPdfProcessor.cs b/src/Paillave.Pdf/IPdfProcessor.cs new file mode 100644 index 00000000..ca70e2e3 --- /dev/null +++ b/src/Paillave.Pdf/IPdfProcessor.cs @@ -0,0 +1,11 @@ +using System.Collections.Generic; + +namespace Paillave.Pdf +{ + public interface IPdfProcessor + { + void ProcessLine(string text, int pageNumber, int lineNumber, int lineNumberInParagraph, int lineNumberInPage, List section); + void ProcessHeader(List section, int pageNumber); + void ProcessTable(List>> table, int pageNumber, List section); + } +} diff --git a/src/Paillave.Pdf/LinesOfWords.cs b/src/Paillave.Pdf/LinesOfWords.cs new file mode 100644 index 00000000..54824d9e --- /dev/null +++ b/src/Paillave.Pdf/LinesOfWords.cs @@ -0,0 +1,62 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; + +namespace Paillave.Pdf +{ + public class LinesOfWords + { + private class LineOfWords + { + private readonly List _words = new List(); + private int _topSum; + public int TopAverage { get; private set; } + private int? 
_topMax; + private int _bottomSum; + private int _bottomAverage; + private int? _bottomMax; + public bool Belongs(Word word) + { + if (_topMax < word.BoundingBox.Centroid.Y || _bottomMax > word.BoundingBox.Centroid.Y) + // if (TopAverage < word.BoundingBox.Centroid.Y || _bottomAverage > word.BoundingBox.Centroid.Y) + return false; + return true; + } + public void Add(Word word) + { + this._words.Add(word); + this._topSum += (int)word.BoundingBox.Top; + this.TopAverage = this._topSum / _words.Count; + this._topMax = Math.Max(this._topMax ?? (int)word.BoundingBox.Top, (int)word.BoundingBox.Top); + this._bottomSum += (int)word.BoundingBox.Bottom; + this._bottomAverage = this._bottomSum / _words.Count; + this._bottomMax = Math.Max(this._bottomMax ?? (int)word.BoundingBox.Bottom, (int)word.BoundingBox.Bottom); + } + public override string ToString() + => string.Join(" ", _words.OrderBy(word => word.BoundingBox.Left).Select(word => word.Text)); + } + public LinesOfWords() { } + public LinesOfWords(IEnumerable words) => AddWords(words); + private readonly List _lines = new List(); + public void AddWord(Word word) + { + var targetLine = _lines.FirstOrDefault(line => line.Belongs(word)); + if (targetLine == null) + { + targetLine = new LineOfWords(); + _lines.Add(targetLine); + } + targetLine.Add(word); + } + public void AddWords(IEnumerable words) + { + foreach (var word in words) + this.AddWord(word); + } + public List GetLines() + => _lines.OrderByDescending(line => line.TopAverage).Select(line => line.ToString()).ToList(); + } +} +//TODO extract structure from expressions +//TODO locate table within structure diff --git a/src/Paillave.Pdf/Paillave.Pdf.csproj b/src/Paillave.Pdf/Paillave.Pdf.csproj new file mode 100644 index 00000000..c8bf7cb1 --- /dev/null +++ b/src/Paillave.Pdf/Paillave.Pdf.csproj @@ -0,0 +1,31 @@ + + + Paillave.Pdf + 2.0.5 + Stéphane Royer + + true + MIT + https://paillave.github.io/Etl.Net/ + Pdf file + PDF files extensions + NugetIcon.png + Pdf files 
extensions + +extensions to deal with PDF files + + latest + + + + + + + net6.0 + + + + + + + diff --git a/src/Paillave.Pdf/PdfBlockLine.cs b/src/Paillave.Pdf/PdfBlockLine.cs new file mode 100644 index 00000000..bd745ca1 --- /dev/null +++ b/src/Paillave.Pdf/PdfBlockLine.cs @@ -0,0 +1,8 @@ +namespace Paillave.Pdf +{ + public class PdfBlockLine + { + public string Text { get; set; } + public string[] TitlePath { get; set; } + } +} diff --git a/src/Paillave.Pdf/PdfReader.cs b/src/Paillave.Pdf/PdfReader.cs new file mode 100644 index 00000000..9bf98478 --- /dev/null +++ b/src/Paillave.Pdf/PdfReader.cs @@ -0,0 +1,86 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using UglyToad.PdfPig; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis; +using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; +using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; + +namespace Paillave.Pdf +{ + public class PdfReader : IDisposable + { + private readonly PdfDocument _pdfDocument; + private IList _patternsToIgnore; + private readonly StructureReader _structureReader; + + public PdfReader(Stream pdfStream, IList patternsToIgnore = null, IList titleSetups = null) + { + _patternsToIgnore = patternsToIgnore; + _pdfDocument = PdfDocument.Open(pdfStream); + _structureReader = new StructureReader(_pdfDocument, titleSetups); + } + public void Dispose() + { + _pdfDocument.Dispose(); + } + + private bool IgnoreLine(TextLine textLine, Page page, List lines) => _patternsToIgnore == null ? false : _patternsToIgnore.Any(i => i.Check(textLine, page, lines)); + private Grid TryAddInGrid(TextLine textLine, List grids) // ALERT! 
this way to do prevents to have several grids at the same horizontal level (not likely to happen, but better to manage it) + { + Grid targetedGrid = null; + foreach (var word in textLine.Words) + foreach (var grid in grids) + if (grid.TryAddWord(word)) + targetedGrid = grid; + return targetedGrid; + + } + public void Read(IPdfProcessor pdfProcessor) + { + int lineNumberInPage = 0; + int lineNumberInParagraph = 0; + int lineNumber = 0; + + foreach (Page page in _pdfDocument.GetPages()) + { + lineNumberInPage = 0; + var words = page.GetWords(NearestNeighbourWordExtractor.Instance); + var gridExtractionResult = new GridExtractor().Extract(page); + var blocks = RecursiveXYCut.Instance + .GetBlocks(words, new RecursiveXYCut.RecursiveXYCutOptions { MinimumWidth = page.Width / 3 }) + .OrderByDescending(i => i.BoundingBox.Top) + .ToList(); + Dictionary> gridSections = new Dictionary>(); + foreach (var block in blocks) + { + foreach (var textLine in block.TextLines.Where(i => !IgnoreLine(i, page, gridExtractionResult.OutOfScopeHorizontalLines))) + { + var grid = this.TryAddInGrid(textLine, gridExtractionResult.Grids); + if (grid != null) + { + gridSections[grid] = _structureReader.Current; + } + else if (_structureReader.ProcessLine(textLine, page, gridExtractionResult.OutOfScopeHorizontalLines)) + { + lineNumberInParagraph = 0; + pdfProcessor.ProcessHeader(_structureReader.Current, page.Number); + } + else + { + pdfProcessor.ProcessLine(textLine.Text, page.Number, ++lineNumber, ++lineNumberInParagraph, ++lineNumberInPage, _structureReader.Current); + } + } + } + foreach (var grid in gridExtractionResult.Grids.OrderByDescending(i => i.Top)) + { + var rows = grid.GetContent(); + gridSections.TryGetValue(grid, out var section); + pdfProcessor.ProcessTable(rows, page.Number, section); + } + } + } + } +} \ No newline at end of file diff --git a/src/Paillave.Pdf/PointEqualityComparer.cs b/src/Paillave.Pdf/PointEqualityComparer.cs new file mode 100644 index 00000000..75c109fe --- 
/dev/null +++ b/src/Paillave.Pdf/PointEqualityComparer.cs @@ -0,0 +1,18 @@ +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using UglyToad.PdfPig.Core; + +namespace Paillave.Pdf +{ + public class PointEqualityComparer : IEqualityComparer + { + private readonly double _proximity = 0; + public PointEqualityComparer() { } + public PointEqualityComparer(double proximity) => _proximity = proximity; + + public bool Equals(PdfPoint a, PdfPoint b) + => a.X >= b.X - _proximity && a.X <= b.X + _proximity + && a.Y >= b.Y - _proximity && a.Y <= b.Y + _proximity; + public int GetHashCode(PdfPoint obj) => 0; + } +} diff --git a/src/Paillave.Pdf/StructureReader.cs b/src/Paillave.Pdf/StructureReader.cs new file mode 100644 index 00000000..c624fe32 --- /dev/null +++ b/src/Paillave.Pdf/StructureReader.cs @@ -0,0 +1,111 @@ +using System; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Linq; +using UglyToad.PdfPig; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis; +using UglyToad.PdfPig.Outline; + +namespace Paillave.Pdf +{ + public class StructureReader + { + private readonly HeadersSetup _rootHeaderLevel = new HeadersSetup(); + private List _currentHeaderPath = new List(); + + public StructureReader(PdfDocument pdfDocument, IList levelHeader) + { + _currentHeaderPath = new List { _rootHeaderLevel }; + if (levelHeader != null) + _rootHeaderLevel.DirectSubLevels.AddRange(levelHeader); + var _bookmarks = GetBookmarks(pdfDocument) ?? 
new List>(); + var bookmarkTemplates = _bookmarks.GroupBy(i => i.Count).Select(i => new { Name = i.Key, Nodes = i.ToList() }).ToList(); + HeadersSetup current = _rootHeaderLevel; + foreach (var bookmarkTemplate in bookmarkTemplates) + { + var newLevel = new HeadersSetup(o => o.Bookmark(bookmarkTemplate.Nodes)); + current.DirectSubLevels.Add(newLevel); + current = newLevel; + } + } + private List> GetBookmarks(IReadOnlyList nodes, List parentPath) + { + var ret = new List>(); + foreach (var bookmark in nodes.OfType()) + { + var currentPath = new List { bookmark }; + currentPath.AddRange(parentPath); + ret.Add(currentPath); + ret.AddRange(GetBookmarks(currentPath)); + } + return ret; + } + private List> GetBookmarks(List parents) => GetBookmarks(parents[0].Children, parents); + private List> GetBookmarks(PdfDocument pdfDocument) + { + if (!pdfDocument.TryGetBookmarks(out var bookmarks)) return null; + return GetBookmarks(bookmarks.Roots, new List()); + } + // public string[] GetCurrent() => _current.Select.ToDictionary(i => i.Key, i => i.Value); + public List Current { get; private set; } = new List(); + public bool ProcessLine(TextLine textLine, Page page, List lines) + { + var successfullCheck = this._currentHeaderPath.Last().DirectSubLevels.FirstOrDefault(i => i.Template.Check(textLine, page, lines)); + if (successfullCheck != null) + { + this.Current = Current.ToList(); + Current.Add(textLine.Text); + _currentHeaderPath.Add(successfullCheck); + return true; + } + if (this._currentHeaderPath.Count == 1) + { + return false; + } + if (this._currentHeaderPath.Count == 2) + { + var subRootSuccessfullCheck = _rootHeaderLevel.DirectSubLevels.FirstOrDefault(i => i.Template.Check(textLine, page, lines)); + if (subRootSuccessfullCheck != null) + { + _currentHeaderPath.RemoveAt(1); + _currentHeaderPath.Add(subRootSuccessfullCheck); + this.Current = Current.ToList(); + Current.Clear(); + Current.Add(textLine.Text); + return true; + } + } + else + { + var 
successfullParentsCheck = this._currentHeaderPath + .Select((i, idx) => new { HeaderLevel = i, Index = idx }) + .Skip(1) + .AsEnumerable().Reverse() + .FirstOrDefault(i => i.HeaderLevel.Template.Check(textLine, page, lines)); + if (successfullParentsCheck != null) + { + this.Current = Current.ToList(); + for (int i = _currentHeaderPath.Count - 1; i >= successfullParentsCheck.Index; i--) + { + _currentHeaderPath.RemoveAt(i); + Current.RemoveAt(i - 1); + } + Current.Add(textLine.Text); + _currentHeaderPath.Add(successfullParentsCheck.HeaderLevel); + return true; + } + } + return false; + } + } + public class HeadersSetup + { + internal HeadersSetup(params HeadersSetup[] directSubLevels) + => (Template, DirectSubLevels) = (null, directSubLevels.ToList()); + public HeadersSetup(Func templateBuilder, params HeadersSetup[] directSubLevels) + => (Template, DirectSubLevels) = (templateBuilder(new TextTemplate()), directSubLevels.ToList()); + public TextTemplate Template { get; } + public List DirectSubLevels { get; } + } +} \ No newline at end of file diff --git a/src/Paillave.Pdf/SvgBuilder.cs b/src/Paillave.Pdf/SvgBuilder.cs new file mode 100644 index 00000000..c1365b5e --- /dev/null +++ b/src/Paillave.Pdf/SvgBuilder.cs @@ -0,0 +1,76 @@ +using System.Collections.Generic; +using System.Text; + +namespace Paillave.Pdf +{ + public class SvgBuilder + { + private readonly List _commands = new List(); + private readonly double _width; + private readonly double _height; + private const string PageColor = "blue"; + private const string LabelColor = "red"; + public SvgBuilder(double width, double height, double pageNumber, string title, bool drawPage) + { + _width = width; + _height = height; + if (drawPage) + { + Rectangle(0, 0, width, height, PageColor); + Text(width / 2, 5, pageNumber.ToString(), PageColor); + Text(_width / 2, _height - 25, title, PageColor); + } + } + public SvgBuilder HorizontalLine(double left, double top, double right, string label = null) + { + 
_commands.Add($""); + if (label != null) Text((left + right) / 2, top, label, LabelColor); + return this; + } + public SvgBuilder VerticalLine(double left, double top, double bottom, string label = null) + { + _commands.Add($""); + if (label != null) Text(left, (top + bottom) / 2, label, LabelColor); + return this; + } + public SvgBuilder Rectangle(double left, double top, double right, double bottom, string color = null) + { + if (color == null) _commands.Add($""); + else _commands.Add($""); + return this; + } + public SvgBuilder Line(double left, double top, double right, double bottom, string color = null, string label = null) + { + if (color == null) _commands.Add($""); + else _commands.Add($""); + if (label != null) Text((left + right) / 2, (top + bottom) / 2, label, LabelColor); + return this; + } + public SvgBuilder Text(double left, double top, string text, string color = null) + { + if (color == null) _commands.Add($"{text}"); + else _commands.Add($"{text}"); + return this; + } + public void Show() => Tools.OpenFile(GetSvg(true), "html"); + public string GetSvg(bool wrapsWithHtml) + { + var sb = new StringBuilder(); + if (wrapsWithHtml) + { + sb.AppendLine(""); + sb.AppendLine(""); + sb.AppendLine(""); + } + sb.AppendLine($""); + foreach (var command in _commands) sb.AppendLine(command); + sb.AppendLine(""); + if (wrapsWithHtml) + { + sb.AppendLine(""); + sb.AppendLine(""); + } + return sb.ToString(); + } + } +} \ No newline at end of file diff --git a/src/Paillave.Pdf/TextTemplate.cs b/src/Paillave.Pdf/TextTemplate.cs new file mode 100644 index 00000000..4dd28cda --- /dev/null +++ b/src/Paillave.Pdf/TextTemplate.cs @@ -0,0 +1,169 @@ +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis; +using UglyToad.PdfPig.Outline; + +namespace Paillave.Pdf +{ + public class TextTemplate + { + private readonly List _checks = new List(); + public 
TextTemplate Pattern(string pattern) + { + _checks.Add(new PatternTextTemplateCheck(pattern)); + return this; + } + public TextTemplate Bold(bool bold = true) + { + _checks.Add(new BoldTextTemplateCheck(bold)); + return this; + } + public TextTemplate Italic(bool italic = true) + { + _checks.Add(new ItalicTextTemplateCheck(italic)); + return this; + } + public TextTemplate Underlined(bool underlined = true) + { + _checks.Add(new UnderlinedTextTemplateCheck(underlined)); + return this; + } + public TextTemplate Size(int size) + { + _checks.Add(new SizeTextTemplateCheck(size)); + return this; + } + public TextTemplate Color(int r, int g, int b) + { + _checks.Add(new ColorTextTemplateCheck(r, g, b)); + return this; + } + public TextTemplate Center() => Alignment(AlignmentType.Center); + public TextTemplate Right() => Alignment(AlignmentType.Right); + public TextTemplate Left() => Alignment(AlignmentType.Left); + public TextTemplate Alignment(AlignmentType alignment) + { + _checks.Add(new AlignmentTextTemplateCheck(alignment)); + return this; + } + public TextTemplate Offset(double offset) + { + _checks.Add(new OffsetTextTemplateCheck(offset)); + return this; + } + public TextTemplate Bookmark(List> bookmarks) + { + _checks.Add(new BookmarkTextTemplateCheck(bookmarks)); + return this; + } + internal bool Check(TextLine textLine, Page page, List lines) + => _checks.TrueForAll(i => i.Check(textLine, page, lines)); + + private interface ITextTemplateCheck + { + bool Check(TextLine textLine, Page page, List lines); + } + private class PatternTextTemplateCheck : ITextTemplateCheck + { + private readonly Regex _regex; + public PatternTextTemplateCheck(string pattern) => _regex = new Regex(pattern, RegexOptions.Compiled); + public bool Check(TextLine textLine, Page page, List lines) => _regex.IsMatch(textLine.Text); + } + private class BoldTextTemplateCheck : ITextTemplateCheck + { + private readonly bool _bold; + public BoldTextTemplateCheck(bool bold) => _bold = bold; + 
public bool Check(TextLine textLine, Page page, List lines) // Bold check: passes when _bold equals the "any bold letter" test below.
+                => _bold == textLine.Words.Any(word => word.Letters.Where(letter => letter.Value != " ").Any(letter => letter.Font.IsBold || letter.Font.Weight > 500)); // Bold = some non-space letter whose font reports IsBold or a Weight above 500.
+        }
+        private class UnderlinedTextTemplateCheck : ITextTemplateCheck // Compares the requested underlined state with the presence of a line just under the text.
+        {
+            private readonly bool _underlined;
+            public UnderlinedTextTemplateCheck(bool underlined) => _underlined = underlined;
+            public bool Check(TextLine textLine, Page page, List lines)
+            {
+                return _underlined == lines.Any(l => l.Left < textLine.BoundingBox.Right && l.Right > textLine.BoundingBox.Left && l.Y < textLine.BoundingBox.Bottom && l.Y > textLine.BoundingBox.Bottom - textLine.BoundingBox.Height); // A line counts when it overlaps the text horizontally and its Y lies strictly between (Bottom - Height) and Bottom of the text box.
+            }
+        }
+        private class ItalicTextTemplateCheck : ITextTemplateCheck // Compares the requested italic state with what the line's letters report.
+        {
+            private readonly bool _italic;
+            public ItalicTextTemplateCheck(bool italic) => _italic = italic;
+            public bool Check(TextLine textLine, Page page, List lines)
+                => _italic == textLine.Words.Any(word => word.Letters.Where(letter => letter.Value != " ").Any(letter => letter.Font.IsItalic)); // Italic = some non-space letter whose font reports IsItalic.
+        }
+        private class SizeTextTemplateCheck : ITextTemplateCheck // Requires a given font size; unlike the bold/italic/underlined checks there is no flag to require its absence.
+        {
+            private readonly int _size;
+            public SizeTextTemplateCheck(int size) => _size = size;
+            public bool Check(TextLine textLine, Page page, List lines)
+                => textLine.Words.Any(word => word.Letters.Where(letter => letter.Value != " ").Any(letter => System.Math.Round(letter.PointSize) == _size)); // One non-space letter whose rounded PointSize equals _size is enough.
+        }
+        private class ColorTextTemplateCheck : ITextTemplateCheck // Requires a given RGB color on at least one non-space letter.
+        {
+            private readonly int _r;
+            private readonly int _g;
+            private readonly int _b;
+            public ColorTextTemplateCheck(int r, int g, int b) => (_r, _g, _b) = (r, g, b);
+            public bool Check(TextLine textLine, Page page, List lines)
+                => textLine.Words.Any(word => word.Letters.Where(letter => letter.Value != " ").Any(letter =>
+                {
+                    var (r, g, b) = letter.Color.ToRGBValues(); // presumably components in the 0..1 range, since they are scaled by 255 below - confirm against the PdfPig API.
+                    return System.Math.Round(255 * r) == _r && System.Math.Round(g * 255) == _g && System.Math.Round(b * 255) == _b; // All three scaled, rounded components must equal the requested values.
+                }));
+        }
+        private
class AlignmentTextTemplateCheck : ITextTemplateCheck // Tests where the line sits horizontally on the page.
+        {
+            private readonly AlignmentType _alignment;
+            public AlignmentTextTemplateCheck(AlignmentType alignment) => _alignment = alignment;
+            public bool Check(TextLine textLine, Page page, List lines)
+            {
+                switch (_alignment)
+                {
+                    case AlignmentType.Left: return textLine.BoundingBox.Left < page.Width * 0.2 && textLine.BoundingBox.Left < (page.Width - textLine.BoundingBox.Right); // Left edge within the first 20% of the page width, and left margin smaller than right margin.
+                    case AlignmentType.Center: return textLine.BoundingBox.Centroid.X > (page.Width * 2 / 5) && textLine.BoundingBox.Centroid.X < (page.Width * 3 / 5); // Centroid strictly inside the middle fifth of the page width.
+                    case AlignmentType.Right: return textLine.BoundingBox.Right > page.Width * 0.8 && textLine.BoundingBox.Left > (page.Width - textLine.BoundingBox.Right); // Right edge beyond 80% of the page width, and left margin larger than right margin.
+                }
+                return true; // Any other AlignmentType value imposes no constraint.
+            }
+        }
+        private class OffsetTextTemplateCheck : ITextTemplateCheck // Tests the position of the line's left edge.
+        {
+            private readonly double _offset;
+            public OffsetTextTemplateCheck(double offset) => _offset = offset;
+            public bool Check(TextLine textLine, Page page, List lines)
+            {
+                var offset = page.Width * _offset / 100; // _offset is a percentage of the page width.
+                var left = offset - 20; // Fixed tolerance of 20 on each side, in the same units as page.Width (presumably PDF points - confirm).
+                var right = offset + 20;
+                return textLine.BoundingBox.Left > left && textLine.BoundingBox.Left < right; // Both bounds are exclusive.
+            }
+        }
+        private class BookmarkTextTemplateCheck : ITextTemplateCheck // Tests whether one of the supplied bookmarks points at the line.
+        {
+            private readonly List> _bookmarks; // NOTE(review): "List>" looks like generic arguments lost in extraction - restore from the real file.
+            public BookmarkTextTemplateCheck(List> bookmarks) => _bookmarks = bookmarks;
+            public bool Check(TextLine textLine, Page page, List lines)
+                => GetMatchingBookmark((decimal)textLine.BoundingBox.Top, (decimal)textLine.BoundingBox.Bottom, page.Number) != null; // Passes as soon as any bookmark entry matches.
+            private List GetMatchingBookmark(decimal top, decimal bottom, int pageNumber)
+            {
+                if (_bookmarks == null) return null; // No bookmark list supplied: nothing can match.
+                var heigh = top - bottom; // Height of the text line. NOTE(review): "heigh" is presumably a typo for "height".
+                return _bookmarks.FirstOrDefault(i =>
+                {
+                    var bBottom = i[0].Destination.Coordinates.Bottom ?? (i[0].Destination.Coordinates.Top.Value - heigh); // Only the first node of each entry is inspected. NOTE(review): i[0] has no emptiness guard, and .Value presumably throws when both Bottom and Top are missing - confirm callers never supply such entries.
+                    var bTop = i[0].Destination.Coordinates.Top ??
(i[0].Destination.Coordinates.Bottom.Value + heigh); // When Top is missing it is derived from Bottom plus the text line's height.
+                    return i[0].PageNumber == pageNumber && bBottom <= top && bTop >= bottom; // Same page, and the bookmark's vertical span overlaps the line's [bottom, top] span.
+                });
+            }
+        }
+    }
+    public enum AlignmentType // Horizontal placements understood by the alignment check of TextTemplate.
+    {
+        Left,
+        Right,
+        Center
+    }
+}
diff --git a/src/Paillave.Pdf/Tools.cs b/src/Paillave.Pdf/Tools.cs
new file mode 100644
index 00000000..a31be9da
--- /dev/null
+++ b/src/Paillave.Pdf/Tools.cs
@@ -0,0 +1,17 @@
+using System.Diagnostics;
+using System.IO;
+
+namespace Paillave.Pdf
+{
+    internal class Tools
+    {
+        // Writes `content` to a new file in the temp folder, named with `extension`, and asks the OS shell to open it.
+        public static void OpenFile(string content, string extension)
+        {
+            // The random name already carries the target extension, so no placeholder file has to be created and renamed.
+            string filePath = Path.Combine(Path.GetTempPath(), Path.ChangeExtension(Path.GetRandomFileName(), extension));
+            File.WriteAllText(filePath, content);
+            Process.Start(new ProcessStartInfo(filePath) { UseShellExecute = true })?.Dispose(); // Start may return null; Dispose only releases the local handle, the opened application keeps running.
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Tutorials/BlogTutorial/BlogTutorial.csproj b/src/Tutorials/BlogTutorial/BlogTutorial.csproj
index abf9d02c..eeb8921b 100644
--- a/src/Tutorials/BlogTutorial/BlogTutorial.csproj
+++ b/src/Tutorials/BlogTutorial/BlogTutorial.csproj
@@ -16,16 +16,16 @@ - + runtime; build; native; contentfiles; analyzers; buildtransitive all - + Exe - net5.0 + net6.0
diff --git a/src/Tutorials/Paillave.Etl.Samples/Paillave.Etl.Samples.csproj b/src/Tutorials/Paillave.Etl.Samples/Paillave.Etl.Samples.csproj
index ecd17c75..b56f3ef9 100644
--- a/src/Tutorials/Paillave.Etl.Samples/Paillave.Etl.Samples.csproj
+++ b/src/Tutorials/Paillave.Etl.Samples/Paillave.Etl.Samples.csproj
@@ -15,16 +15,16 @@ - + runtime; build; native; contentfiles; analyzers; buildtransitive all - + Exe - net5.0 + net6.0
diff --git a/src/Tutorials/SimpleTutorial/Program.cs b/src/Tutorials/SimpleTutorial/Program.cs
index 4e5245c8..77a296a4 100644
--- a/src/Tutorials/SimpleTutorial/Program.cs
+++ b/src/Tutorials/SimpleTutorial/Program.cs
@@ -7,6 +7,7 @@ using Paillave.Etl.SqlServer; using
System.Data.SqlClient; using System.Linq; +using Paillave.Etl.Bloomberg; namespace SimpleTutorial { @@ -25,7 +26,7 @@ private static void DefineProcess1(ISingleStream contextStream) { contextStream .CrossApplyFolderFiles("get files", "*.txt") - .CrossApply("parse file", BloombergValuesProvider.Create(FlatFileDefinition.Create(i => new + .CrossApply("parse file", BloombergValuesProvider.Create(i => new { CountryFullName = i.ToColumn("COUNTRY_FULL_NAME"), IndustrySector = i.ToColumn("INDUSTRY_SECTOR"), @@ -45,7 +46,7 @@ private static void DefineProcess1(ISingleStream contextStream) SecurityTyp = i.ToColumn("SECURITY_TYP"), PeRatio = i.ToNumberColumn("PE_RATIO", "."), EqyDvdYld12m = i.ToNumberColumn("EQY_DVD_YLD_12M", "."), - }))) + })) .Do("write to console", i => { diff --git a/src/Tutorials/SimpleTutorial/SimpleTutorial.csproj b/src/Tutorials/SimpleTutorial/SimpleTutorial.csproj index abf9d02c..a8171fae 100644 --- a/src/Tutorials/SimpleTutorial/SimpleTutorial.csproj +++ b/src/Tutorials/SimpleTutorial/SimpleTutorial.csproj @@ -3,6 +3,7 @@ + @@ -16,16 +17,16 @@ - + runtime; build; native; contentfiles; analyzers; buildtransitive all - + Exe - net5.0 + net6.0