diff --git a/.vscode/settings.json b/.vscode/settings.json index c7f24eb..c08d72f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "editor.formatOnSave": true, "editor.defaultFormatter": "esbenp.prettier-vscode", - "prettier.singleQuote": false + "prettier.singleQuote": false, + "editor.acceptSuggestionOnEnter": "on" } diff --git a/index.d.ts b/index.d.ts index 54b4fed..d9b6a0d 100644 --- a/index.d.ts +++ b/index.d.ts @@ -35,6 +35,14 @@ export type Item = { text: string; }; +export type RuleAccumulator = (item: Item) => boolean | void; +export type RuleHandler<T = any> = (value: T) => void; + +export interface TableResult { + matrix: string[][]; + items: Item[]; +} + export class TableParser { private rows: { [key: string]: Item[] }; constructor(); @@ -47,3 +55,27 @@ export class TableParser { getCleanMatrix(options?: { collisionSeparator: string }): string[][]; renderMatrix(): string; } + +export class Rule { + static on(regexp: RegExp): Rule; + static after(regexp: RegExp): Rule; + static makeItemProcessor(rules: Rule[]): (item: DataEntry) => void; + static addAccumulator(methodName: string, methodBuilder: Function): void; + + constructor(regexp: RegExp); + + // Accumulator methods + extractRegexpValues(): Rule; + parseNextItemValue(): Rule; + accumulateAfterHeading(): Rule; + accumulateFromSameX(): Rule; + parseColumns(...args: any[]): Rule; + parseTable(columnCount: number): Rule & { + then(handler: (result: TableResult) => void): Rule; + }; + + then<T>(handler: RuleHandler<T>): Rule; + + private test(item: Item): RuleAccumulator | undefined; + private whenDone(callback: () => void): void; +} \ No newline at end of file diff --git a/index.js b/index.js index ea38662..27553f3 100644 --- a/index.js +++ b/index.js @@ -1,12 +1,14 @@ +import * as parseTableExports from "./lib/parseTable.js"; +import * as parseColumnsExports from "./lib/parseColumns.js"; + export { PdfReader } from "./PdfReader.js"; export { Rule } from "./Rule.js"; export * as LOG 
from "./lib/LOG.js"; -import * as parseTableExports from "./lib/parseTable.js"; export const parseTable = Object.assign( parseTableExports.parseTable, parseTableExports ); -import * as parseColumnsExports from "./lib/parseColumns.js"; + export const parseColumns = Object.assign( parseColumnsExports.parseColumns, parseColumnsExports diff --git a/package-lock.json b/package-lock.json index 0a2c104..4cc6bb0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,8 @@ "execa": "^6.1.0", "prettier": "2.6.1", "rollup": "^4.19.1", - "semantic-release": "^19.0.2" + "semantic-release": "^19.0.2", + "typescript": "^5.1.6" }, "engines": { "node": ">=14" @@ -8127,6 +8128,19 @@ "node": ">=8" } }, + "node_modules/typescript": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.1.6.tgz", + "integrity": "sha512-zaWCozRZ6DLEWAWFrVDz1H6FVXzUSfTy5FUMWsQlU8Ym5JP9eO4xkTIROFCQvhQf61z6O/G6ugw3SgAnvvm+HA==", + "dev": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, "node_modules/uglify-js": { "version": "3.15.3", "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.15.3.tgz", @@ -14240,6 +14254,12 @@ "integrity": "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==", "dev": true }, + "typescript": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.1.6.tgz", + "integrity": "sha512-zaWCozRZ6DLEWAWFrVDz1H6FVXzUSfTy5FUMWsQlU8Ym5JP9eO4xkTIROFCQvhQf61z6O/G6ugw3SgAnvvm+HA==", + "dev": true + }, "uglify-js": { "version": "3.15.3", "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.15.3.tgz", diff --git a/package.json b/package.json index 2d99c8f..2349abb 100644 --- a/package.json +++ b/package.json @@ -55,7 +55,8 @@ "execa": "^6.1.0", "prettier": "2.6.1", "semantic-release": "^19.0.2", - "rollup": "^4.19.1" + "rollup": "^4.19.1", + "typescript": "^5.1.6" }, "engines": { 
"node": ">=14" diff --git a/test/test.ts b/test/test.ts new file mode 100644 index 0000000..bb7aa45 --- /dev/null +++ b/test/test.ts @@ -0,0 +1,10 @@ +import * as lib from "../types/index.js"; +const PdfReader = lib.PdfReader; + +const TESTFILE = "./test/sample.pdf"; + +new PdfReader().parseFileItems(TESTFILE, (err, item) => { + if (err) console.error("error:", err); + else if (!item) console.warn("end of file"); + else if (item.text) console.log(item.text); +}); diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..4630b84 --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,111 @@ +{ + "compilerOptions": { + /* Visit https://aka.ms/tsconfig to read more about this file */ + + /* Projects */ + // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ + // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ + // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ + // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ + // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ + // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ + + /* Language and Environment */ + "target": "es2016" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */, + // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ + // "jsx": "preserve", /* Specify what JSX code is generated. */ + // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. 
*/ + // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ + // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ + // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ + // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ + // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ + // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ + // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ + // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ + + /* Modules */ + "module": "commonjs" /* Specify what module code is generated. */, + // "rootDir": "./", /* Specify the root folder within your source files. */ + // "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */ + // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ + // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ + // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ + // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ + "types": [ + "./types/index.d.ts" + ] /* Specify type package names to be included without being referenced in a source file. */, + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. 
*/ + // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ + // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ + // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ + // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ + // "resolveJsonModule": true, /* Enable importing .json files. */ + // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ + // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */ + + /* JavaScript Support */ + // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ + // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ + // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ + + /* Emit */ + // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + // "declarationMap": true, /* Create sourcemaps for d.ts files. */ + // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ + // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ + // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ + // "outDir": "./", /* Specify an output folder for all emitted files. 
*/ + // "removeComments": true, /* Disable emitting comments. */ + // "noEmit": true, /* Disable emitting files from a compilation. */ + // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ + // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */ + // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ + // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ + // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ + // "newLine": "crlf", /* Set the newline character for emitting files. */ + // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ + // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ + // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ + // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ + // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ + + /* Interop Constraints */ + // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ + // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. 
*/ + // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ + "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */, + // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ + "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */, + + /* Type Checking */ + "strict": true /* Enable all strict type-checking options. */, + // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ + // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ + // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ + // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ + // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ + // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ + // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ + // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ + // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ + // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ + // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ + // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. 
*/ + // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ + // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ + // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ + // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ + + /* Completeness */ + // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + "skipLibCheck": true /* Skip type checking all .d.ts files. */ + } +} diff --git a/types/LOG.d.ts b/types/LOG.d.ts new file mode 100644 index 0000000..ff9e5b3 --- /dev/null +++ b/types/LOG.d.ts @@ -0,0 +1,2 @@ +export function log(...args: any[]): void; +export function toggle(enabled: boolean): void; diff --git a/types/PdfReader.d.ts b/types/PdfReader.d.ts new file mode 100644 index 0000000..8153e15 --- /dev/null +++ b/types/PdfReader.d.ts @@ -0,0 +1,99 @@ +import { Transform, Readable, TransformOptions } from "stream"; +import { EventEmitter } from "events"; +import * as fs from "fs"; + +declare module "pdf2json/lib/pdf.js" { + class PDFJSClass extends EventEmitter { + constructor(needRawText: boolean); + + raiseErrorEvent(errMsg: string): string; + raiseReadyEvent(data: any): any; + parsePDFData(arrayBuffer: ArrayBuffer, password: string): void; + tryLoadFieldInfoXML(pdfFilePath: string): void; + load(pdfDocument: any, scale: number): Promise<void>; + loadMetaData(): Promise<void>; + parseMetaData(): void; + loadPages(): Promise<void>; + parsePage(promisedPages: any[], id: number, scale: number): void; + getRawTextContent(): string; + getAllFieldsTypes(): any; + getMergedTextBlocksIfNeeded(): any; + destroy(): void; + } + + 
export = PDFJSClass; +} + +declare class ParserStream extends Transform { + static createContentStream(jsonObj: any): Readable; + static createOutputStream( + outputPath: string, + resolve: () => void, + reject: (err: Error) => void + ): fs.WriteStream; + + constructor(pdfParser: any, options?: TransformOptions); +} + +declare module "parserstream.js" { + export = ParserStream; + + export class StringifyStream extends Transform { + constructor(options?: TransformOptions); + } +} + +declare module "/pdf2json/lib/pdfconst.js" { + export const kColors: string[]; + export const kFontFaces: string[]; + export const kFontStyles: [number, number, number, number][]; +} + +declare class PDFParser extends EventEmitter { + static colorDict: any; + static fontFaceDict: any; + static fontStyleDict: any; + + constructor(context: any, needRawText: boolean, password: string); + + createParserStream(): ParserStream; // Define the type of ParserStream if needed. + loadPDF(pdfFilePath: string, verbosity: number): Promise<void>; + parseBuffer(pdfBuffer: Buffer): void; + getRawTextContent(): string; + getRawTextContentStream(): any; // Define the type of the stream if needed. + getAllFieldsTypes(): any; // Define the type of the result. + getAllFieldsTypesStream(): any; // Define the type of the stream if needed. + getMergedTextBlocksIfNeeded(): any; // Define the type of the result. + getMergedTextBlocksStream(): any; // Define the type of the stream if needed. 
+ destroy(): void; +} + +declare module "LOG.js" { + // No need to re-export here, just reference the functions + // Exporting and re-exporting is not necessary +} + +declare class PdfReader { + constructor(options?: PdfReaderOptions); + parseFileItems(pdfFilePath: string, itemHandler: ItemHandler): void; + parseBuffer(pdfBuffer: Buffer, itemHandler: ItemHandler): void; +} + +interface PdfReaderOptions { + password?: string; + debug?: boolean; +} + +interface Item { + file?: { path: string; buffer?: Buffer }; + page?: number; + width?: number; + height?: number; + text?: string; + x?: number; + y?: number; + w?: number; + h?: number; +} + +type ItemHandler = (error: Error | null, item?: Item) => void; diff --git a/types/Rule.d.ts b/types/Rule.d.ts new file mode 100644 index 0000000..11809b3 --- /dev/null +++ b/types/Rule.d.ts @@ -0,0 +1,48 @@ +declare module "./lib/LOG" { + export function log(...args: any[]): void; +} + +declare module "./lib/parseColumns" { + // Declare types from parseColumns.js here if needed +} + +declare module "./lib/parseTable" { + // Declare types from parseTable.js here if needed +} + +declare namespace RuleNamespace { + interface RuleConstructor { + new (regexp: RegExp): RuleInstance; + on(regexp: RegExp): RuleInstance; + after(regexp: RegExp): RuleInstance; + accumulators: { + [accumulatorName: string]: (...args: any[]) => RuleAccumulator; + // Add more accumulator declarations here + }; + addAccumulator( + methodName: string, + methodBuilder: (...args: any[]) => RuleAccumulator + ): void; + } + + interface RuleInstance { + regexp: RegExp; + methodName?: string; + accumulatorParams?: any[]; + accumulatorBuilder?: (...args: any[]) => RuleAccumulator; + terminate?: () => void; + currentItem?: any; // Define the type of currentItem if needed + accumulatorImpl?: RuleAccumulator; + skipCurrentItem?: boolean; + output?: any; // Define the type of output if needed + then(fct: (output: any) => void): RuleInstance; + test(item: any): 
RuleAccumulator | undefined; + whenDone(fct: () => void): void; + } + + type RuleAccumulator = (item: any) => boolean | void; +} + +declare const Rule: RuleNamespace.RuleConstructor; + +export = Rule; diff --git a/types/index.d.ts b/types/index.d.ts new file mode 100644 index 0000000..bd92316 --- /dev/null +++ b/types/index.d.ts @@ -0,0 +1,13 @@ +// Import individual type declarations +import "./LOG.d.ts"; +import "./PdfReader.d.ts"; +import "./parse.d.ts"; +import "./Rule.d.ts"; +import "./parseAsBuffer.d.ts"; + +// Export individual modules and types +export { LOG } from "./LOG"; +export { PdfReader } from "./PdfReader"; +export { parse } from "./parse"; +export { Rule } from "./Rule"; +export { parseAsBuffer } from "./parseAsBuffer"; diff --git a/types/parse.d.ts b/types/parse.d.ts new file mode 100644 index 0000000..9140b8b --- /dev/null +++ b/types/parse.d.ts @@ -0,0 +1,6 @@ +declare module "parse.js" { + export function printRawItems( + filename: string, + callback: (err?: Error) => void + ): void; +} diff --git a/types/parseAsBuffer.d.ts b/types/parseAsBuffer.d.ts new file mode 100644 index 0000000..3ddd195 --- /dev/null +++ b/types/parseAsBuffer.d.ts @@ -0,0 +1,24 @@ +import { ItemHandler, PdfReaderOptions } from "./PdfReader"; + +declare module "LOG.js" { + export function toggle(enabled: boolean): void; +} + +declare module "./index.js" { + // Update this path based on your actual index.js path + + export class PdfReader { + constructor(options?: PdfReaderOptions); + parseBuffer(pdfBuffer: Buffer, itemHandler: ItemHandler): void; + // Add any other methods or types related to PdfReader here + } +} + +declare function printRawItems( + pdfBuffer: Buffer, + callback: (err?: Error) => void +): void; + +declare module "parseAsBuffer.js" { + // No need to re-export printRawItems here +}