Skip to content

Commit

Permalink
fix: expose publishedTime if possible
Browse files: browse the repository at this point in the history
  • Loading branch information
nomagick committed Apr 17, 2024
1 parent 6e36f0a commit a211366
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 16 deletions.
8 changes: 7 additions & 1 deletion backend/functions/src/cloud-functions/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,18 @@ export class CrawlerHost extends RPCHost {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: nominalUrl || snapshot.href?.trim(),
content: cleanText,
publishedTime: snapshot.parsed?.publishedTime || undefined,

toString() {
const mixins = [];
if (this.publishedTime) {
mixins.push(`Published Time: ${this.publishedTime}`);
}

return `Title: ${this.title}
URL Source: ${this.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content:
${this.content}
`;
Expand Down
32 changes: 17 additions & 15 deletions backend/functions/src/services/puppeteer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,25 @@ export interface ImgBrief {
alt?: string;
}

/**
 * Shape of the article data extracted from a page.
 *
 * NOTE(review): the name suggests this mirrors the result object of a
 * Readability-style parser — confirm against the code that populates it.
 * `PageSnapshot.parsed` is typed as a `Partial` of this interface (and may be
 * `null`), so consumers should expect any individual field to be missing.
 */
export interface ReadabilityParsed {
    /** Preferred over the snapshot's own `title` when the crawler formats its result. */
    title: string;
    /** A snapshot without this field triggers a salvage attempt in the scraper. */
    content: string;
    textContent: string;
    /** NOTE(review): presumably the size of the extracted text — units not visible here. */
    length: number;
    excerpt: string;

    byline: string;
    siteName: string;
    /** When set, the crawler adds a `Published Time: …` line to its text output. */
    publishedTime: string;
    lang: string;
    dir: string;
}

/**
 * State captured from a page. The scraper obtains values of this type by
 * evaluating `giveSnapshot()` in the page and casting the result.
 */
export interface PageSnapshot {
title: string;
href: string;
html: string;
text: string;
// NOTE(review): this listing is a rendered diff without +/- markers, so two
// declarations of `parsed` appear back to back. The inline object type below
// appears to be the removed (pre-commit) form; its fields are the same ones
// declared by `ReadabilityParsed`. Keeping both in a real file would be a
// duplicate-member error — confirm which side applies before copying.
parsed?: {
title: string;
content: string;
textContent: string;
length: number;
excerpt: string;
byline: string;
dir: string;
siteName: string;
lang: string;
publishedTime: string;
} | null;
// Appears to be the replacement declaration: every field of the parsed result
// is optional, and the whole value may be `null` or absent.
parsed?: Partial<ReadabilityParsed> | null;
// NOTE(review): presumably the JPEG buffer returned by `page.screenshot` —
// the assignment onto the snapshot is not visible here; verify.
screenshot?: Buffer;
imgs?: ImgBrief[];
}
Expand Down Expand Up @@ -121,7 +123,7 @@ export class PuppeteerControl extends AsyncService {
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
preparations.push(page.setViewport({ width: 1024, height: 1024 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
page.emit('snapshot', snapshot);
}));
Expand Down Expand Up @@ -262,15 +264,15 @@ function giveSnapshot() {
}
screenshot = await page.screenshot({
type: 'jpeg',
quality: 85,
quality: 75,
});
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
if (!snapshot.title || !snapshot.parsed?.content) {
const salvaged = await this.salvage(url, page);
if (salvaged) {
screenshot = await page.screenshot({
type: 'jpeg',
quality: 85,
quality: 75,
});
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
}
Expand Down

0 comments on commit a211366

Please sign in to comment.