Skip to content

Commit

Permalink
feat: allow passing base64 encoded pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleeit committed Aug 22, 2024
1 parent de50c93 commit 080056e
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 6 deletions.
16 changes: 16 additions & 0 deletions backend/functions/src/cloud-functions/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;

return;
}

// A PDF was supplied inline through the `pdf` option: build a synthetic snapshot
// around it and return before the cache handling that follows this branch.
if (crawlerOpts?.pdf) {
// The option value is percent-encoded before being placed in the data URL, so
// characters such as '+', '/' and '=' never appear raw in the payload. Any
// consumer has to percent-decode the payload before base64-decoding it.
const pdfDataUrl = `data:application/pdf;base64,${encodeURIComponent(crawlerOpts.pdf)}`;
// Stand-in snapshot: an HTML page that embeds the PDF, with empty title/text
// and the data URL listed under `pdfs`.
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
title: '',
text: '',
pdfs: [pdfDataUrl],
} as PageSnapshot;

// NOTE(review): `crawlOpts` (not `crawlerOpts`) is passed here — presumably a
// separate options object defined outside this view; confirm it is intended.
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);

return;
}

let cache;

const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
Expand Down
5 changes: 5 additions & 0 deletions backend/functions/src/dto/scrapping-options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
html?: string;

/**
 * Optional PDF document supplied inline as a base64-encoded string.
 */
@Prop({
desc: 'Base64 encoded PDF.',
})
pdf?: string;

@Prop({
default: 'default',
})
Expand Down
51 changes: 45 additions & 6 deletions backend/functions/src/services/pdf-extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs';
import { FirebaseStorageBucketControl } from '../shared';
import { randomUUID } from 'crypto';
import { PDFDocumentLoadingTask } from 'pdfjs-dist';
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
dayjs.extend(utc); // Extend dayjs with the UTC plugin
const timezone = require('dayjs/plugin/timezone');
Expand Down Expand Up @@ -62,12 +63,45 @@ export class PDFExtractor extends AsyncService {
this.emit('ready');
}

/**
 * Tells whether `url` is a single-line base64 `data:` URL of the form
 * `data:<type>/<subtype>[;param...];base64,<payload>`.
 *
 * The media-type header is matched with negated character classes instead of
 * a greedy `.+`. Base64 payloads may contain `/`, and a greedy match would
 * retry from every one of them, which is quadratic on multi-megabyte inputs.
 *
 * @param url - Candidate URL string.
 * @returns `true` when `url` has the shape above, whatever its media type.
 */
isDataUrl(url: string) {
    return /^data:[^\/;,]+\/[^;,]+(?:;[^;,]*)*;base64,.*$/.test(url);
}

/**
 * Splits a base64 PDF data URL into its media subtype and its payload.
 *
 * Accepted shape: `data:<type>/<subtype>[;param...];base64,<payload>`.
 * Optional media-type parameters (for example `;name=file.pdf`) are skipped
 * rather than being folded into the subtype. The header is matched with
 * negated character classes so the match never backtracks through the
 * payload, which may itself contain `/`.
 *
 * @param url - Data URL to parse.
 * @returns `type` (the subtype, always `'pdf'` on success) and `data`, the
 *   text following `;base64,` exactly as it appears in `url` (not decoded).
 * @throws Error `'Invalid data URL'` when `url` does not have the shape above.
 * @throws Error `'Invalid data URL type'` when the subtype is not `pdf`.
 */
parseDataUrl(url: string) {
    const matches = url.match(/^data:[^\/;,]+\/([^;,]+)(?:;[^;,]*)*;base64,(.*)$/);
    if (!matches || matches.length !== 3) {
        throw new Error('Invalid data URL');
    }

    const [, type, data] = matches;
    if (type !== 'pdf') {
        throw new Error('Invalid data URL type');
    }

    return { type, data };
}

async extract(url: string | URL) {
const loadingTask = this.pdfjs.getDocument({
url,
disableFontFace: true,
verbosity: 0
});
let loadingTask: PDFDocumentLoadingTask;

if (typeof url === 'string' && this.isDataUrl(url)) {
const { data } = this.parseDataUrl(url);

loadingTask = this.pdfjs.getDocument({
data: atob(decodeURIComponent(data)),
disableFontFace: true,
verbosity: 0
});
} else {
loadingTask = this.pdfjs.getDocument({
url,
disableFontFace: true,
verbosity: 0
});
}


const doc = await loadingTask.promise;
const meta = await doc.getMetadata();
Expand Down Expand Up @@ -237,6 +271,11 @@ export class PDFExtractor extends AsyncService {

const digest = md5Hasher.hash(url.toString());

// Keep the original input in `data`; `url` may be replaced just below.
const data = url;
if (typeof url === 'string' && this.isDataUrl(url)) {
// For base64 data URLs, replace `url` with a short pseudo-URL built from the
// digest computed above. NOTE(review): presumably this keeps the full payload
// out of whatever later records or logs `url` (not visible here) — confirm.
url = `dataurl://digest:${digest}`;
}

const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

if (cache) {
Expand Down Expand Up @@ -275,7 +314,7 @@ export class PDFExtractor extends AsyncService {
let extracted;

try {
extracted = await this.extract(url);
extracted = await this.extract(data);

const theID = randomUUID();
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
Expand Down

0 comments on commit 080056e

Please sign in to comment.