Skip to content

Commit

Permalink
feat: script injecting and tools
Browse files Browse the repository at this point in the history
  • Loading branch information
nomagick committed Nov 8, 2024
1 parent bd629a8 commit 22647a0
Show file tree
Hide file tree
Showing 6 changed files with 300 additions and 38 deletions.
42 changes: 35 additions & 7 deletions backend/functions/src/cloud-functions/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';

export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean;
withIframe?: boolean | 'quoted';
withShadowDom?: boolean;
targetSelector?: string | string[];
removeSelector?: string | string[];
Expand Down Expand Up @@ -69,6 +69,10 @@ export class CrawlerHost extends RPCHost {
// Potential privacy issue: don't cache if cookies are used
return;
}
if (options.injectFrameScripts?.length || options.injectPageScripts?.length) {
// Potentially mangled content: don't cache if scripts are injected
return;
}
if (options.locale) {
Reflect.set(snapshot, 'locale', options.locale);
}
Expand Down Expand Up @@ -237,7 +241,7 @@ export class CrawlerHost extends RPCHost {
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
const crawlOpts = this.configure(crawlerOptions);
const crawlOpts = await this.configure(crawlerOptions);


if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
Expand Down Expand Up @@ -284,7 +288,7 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);

if (crawlerOptions.timeout === undefined) {
if (crawlerOptions.isEarlyReturnApplicable()) {
return formatted;
}

Expand Down Expand Up @@ -315,7 +319,7 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);

if (crawlerOptions.timeout === undefined) {
if (crawlerOptions.isEarlyReturnApplicable()) {
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {

return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
Expand Down Expand Up @@ -557,8 +561,8 @@ export class CrawlerHost extends RPCHost {

let cache;

const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
if (cacheTolerance && !crawlOpts?.cookies?.length) {
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
cache = await this.queryCache(urlToCrawl, cacheTolerance);
}

Expand Down Expand Up @@ -665,7 +669,7 @@ export class CrawlerHost extends RPCHost {
}
}

configure(opts: CrawlerOptions) {
async configure(opts: CrawlerOptions) {

this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
Expand Down Expand Up @@ -697,6 +701,30 @@ export class CrawlerHost extends RPCHost {
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
}

if (opts.injectFrameScript?.length) {
crawlOpts.injectFrameScripts = (await Promise.all(
opts.injectFrameScript.map((x) => {
if (URL.canParse(x)) {
return fetch(x).then((r) => r.text());
}

return x;
})
)).filter(Boolean);
}

if (opts.injectPageScript?.length) {
crawlOpts.injectPageScripts = (await Promise.all(
opts.injectPageScript.map((x) => {
if (URL.canParse(x)) {
return fetch(x).then((r) => r.text());
}

return x;
})
)).filter(Boolean);
}

return crawlOpts;
}

Expand Down
4 changes: 2 additions & 2 deletions backend/functions/src/cloud-functions/searcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ export class SearcherHost extends RPCHost {

delete crawlerOptions.html;

const crawlOpts = this.crawler.configure(crawlerOptions);
const crawlOpts = await this.crawler.configure(crawlerOptions);
const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
const r = await this.cachedWebSearch({
q: searchQuery,
Expand All @@ -156,7 +156,7 @@ export class SearcherHost extends RPCHost {
}

const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs },
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
count,
);

Expand Down
50 changes: 46 additions & 4 deletions backend/functions/src/dto/scrapping-options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,9 @@ export class CrawlerOptions extends AutoCastable {

@Prop({
default: false,
type: [String, Boolean]
})
withIframe!: boolean;
withIframe!: boolean | 'quoted';

@Prop({
default: false,
Expand All @@ -211,6 +212,16 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
userAgent?: string;

@Prop({
arrayOf: String,
})
injectPageScript?: string[];

@Prop({
arrayOf: String,
})
injectFrameScript?: string[];

@Prop({
validate: (v: number) => v > 0 && v <= 180,
type: Number,
Expand Down Expand Up @@ -293,7 +304,7 @@ export class CrawlerOptions extends AutoCastable {
}
const withIframe = ctx?.req.get('x-with-iframe');
if (withIframe !== undefined) {
instance.withIframe = Boolean(withIframe);
instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
}
if (instance.withIframe) {
instance.timeout ??= null;
Expand Down Expand Up @@ -330,6 +341,37 @@ export class CrawlerOptions extends AutoCastable {

return instance;
}

/**
 * Reports whether these options permit an early return.
 *
 * The answer is `false` when any of the following holds:
 *  - `timeout` is anything other than `undefined` (an explicit `null` counts as set);
 *  - `waitForSelector` is non-empty;
 *  - at least one frame script or page script is requested for injection.
 *
 * @returns `true` only when none of the conditions above apply.
 */
isEarlyReturnApplicable() {
    const timeoutIsUnset = this.timeout === undefined;
    const awaitsSelector = Boolean(this.waitForSelector?.length);
    const injectsScripts = Boolean(this.injectFrameScript?.length || this.injectPageScript?.length);

    return timeoutIsUnset && !awaitsSelector && !injectsScripts;
}

/**
 * Reports whether these options permit querying the cache.
 *
 * The answer is `false` when any of the following holds:
 *  - `noCache` is truthy;
 *  - `cacheTolerance` is exactly `0`;
 *  - `setCookies` is non-empty;
 *  - at least one frame script or page script is requested for injection.
 *
 * @returns `true` only when none of the conditions above apply.
 */
isCacheQueryApplicable() {
    const cachingDisabled = Boolean(this.noCache) || this.cacheTolerance === 0;
    const carriesCookies = Boolean(this.setCookies?.length);
    const injectsScripts = Boolean(this.injectFrameScript?.length || this.injectPageScript?.length);

    return !(cachingDisabled || carriesCookies || injectsScripts);
}
}

export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
Expand All @@ -347,14 +389,14 @@ function filterSelector(s?: string | string[]) {
return s;
}
const sr = Array.isArray(s) ? s : [s];
const selectors = sr.filter((i)=> {
const selectors = sr.filter((i) => {
const innerSelectors = i.split(',').map((s) => s.trim());
const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:'));
if (someViolation) {
return false;
}
return true;
})
});

return selectors;
};
8 changes: 7 additions & 1 deletion backend/functions/src/services/jsdom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ export class JSDomControl extends AsyncService {
jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => {
const src = x.getAttribute('src');
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
if (thisSnapshot?.html) {
if (options?.withIframe === 'quoted') {
const blockquoteElem = jsdom.window.document.createElement('blockquote');
const preElem = jsdom.window.document.createElement('pre');
preElem.innerHTML = thisSnapshot?.text || '';
blockquoteElem.appendChild(preElem);
x.replaceWith(blockquoteElem);
} else if (thisSnapshot?.html) {
x.innerHTML = thisSnapshot.html;
x.querySelectorAll('script, style').forEach((s) => s.remove());
x.querySelectorAll('[src]').forEach((el) => {
Expand Down
Loading

0 comments on commit 22647a0

Please sign in to comment.