Skip to content

Commit

Permalink
fix: target selector
Browse files — browse the repository at this point in the history
  • Loading branch information
nomagick committed Sep 17, 2024
1 parent e27bcac commit c36aa73
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 4 deletions.
12 changes: 9 additions & 3 deletions backend/functions/src/cloud-functions/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ export class CrawlerHost extends RPCHost {
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
- if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
+ if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}

Expand All @@ -287,12 +287,15 @@ export class CrawlerHost extends RPCHost {
return formatted;
}

- if (chargeAmount && scrapped.pdfs?.length) {
+ if (chargeAmount && scrapped?.pdfs?.length) {
return formatted;
}
}

if (!lastScrapped) {
+ if (crawlOpts.targetSelector) {
+ throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
+ }
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}

Expand All @@ -304,7 +307,7 @@ export class CrawlerHost extends RPCHost {

for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
- if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
+ if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}

Expand All @@ -330,6 +333,9 @@ export class CrawlerHost extends RPCHost {
}

if (!lastScrapped) {
+ if (crawlOpts.targetSelector) {
+ throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
+ }
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}

Expand Down
8 changes: 8 additions & 0 deletions backend/functions/src/services/jsdom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ export class JSDomControl extends AsyncService {
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
}

+ let bewareTargetContentDoesNotExist = false;
if (Array.isArray(options?.targetSelector)) {
+ bewareTargetContentDoesNotExist = true;
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
x.forEach((el) => {
if (!allNodes.includes(el)) {
Expand All @@ -87,6 +89,7 @@ export class JSDomControl extends AsyncService {
});
}
} else if (options?.targetSelector) {
+ bewareTargetContentDoesNotExist = true;
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
if (!allNodes.includes(el)) {
allNodes.push(el);
Expand All @@ -97,6 +100,11 @@ export class JSDomControl extends AsyncService {
}

if (!allNodes.length) {
+
+ if (bewareTargetContentDoesNotExist) {
+ return undefined;
+ }
+
return snapshot;
}
const textChunks: string[] = [];
Expand Down
2 changes: 1 addition & 1 deletion thinapps-shared

0 comments on commit c36aa73

Please sign in to comment.