Skip to content

Commit

Permalink
Caleb: got it to a testable state I believe
Browse files Browse the repository at this point in the history
  • Loading branch information
calebpeffer committed Apr 28, 2024
1 parent 6ee1f2d commit 0649772
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 31 deletions.
44 changes: 43 additions & 1 deletion apps/api/src/__tests__/e2e_withAuth/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";


describe("E2E Tests for API Routes", () => {
describe.only("E2E Tests for API Routes", () => {
beforeAll(() => {
process.env.USE_DB_AUTHENTICATION = "true";
});
Expand Down Expand Up @@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
}, 60000); // 60 seconds
});

describe("POST /v0/scrape with LLM Extraction", () => {
  it("should extract data using LLM extraction mode", async () => {
    const response = await request(TEST_URL)
      .post("/v0/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: "https://mendable.ai",
        pageOptions: {
          onlyMainContent: true
        },
        // Field names must match ExtractorOptions in lib/entities.ts
        // ({ mode, extractionPrompt, extractionSchema }); the previous
        // names (extractorMode / extractor_prompt / extractorSchema) were
        // silently ignored, leaving the scrape in plain "markdown" mode.
        extractorOptions: {
          mode: "llm-extraction",
          extractionPrompt:
            "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
          extractionSchema: {
            type: "object",
            properties: {
              company_mission: { type: "string" },
              supports_sso: { type: "boolean" },
              is_open_source: { type: "boolean" }
            },
            required: ["company_mission", "supports_sso", "is_open_source"]
          }
        }
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // models.ts attaches the raw tool-call arguments to the document as
    // `llm_extraction` (a JSON string per the OpenAI tool-calls API), so the
    // extracted fields are NOT top-level properties of `data`.
    expect(response.body.data).toHaveProperty("llm_extraction");
    const extraction = JSON.parse(response.body.data.llm_extraction);
    expect(extraction).toHaveProperty("company_mission");
    expect(typeof extraction.supports_sso).toBe("boolean");
    expect(typeof extraction.is_open_source).toBe("boolean");
  }, 60000); // live scrape + LLM call can be slow; matches sibling e2e timeouts
});

describe("GET /is-production", () => {
it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production");
Expand Down
11 changes: 9 additions & 2 deletions apps/api/src/controllers/scrape.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { ExtractorOptions } from './../lib/entities';
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
Expand All @@ -11,7 +12,8 @@ export async function scrapeHelper(
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: any
pageOptions: any,
extractorOptions: any
): Promise<{
success: boolean;
error?: string;
Expand All @@ -35,6 +37,7 @@ export async function scrapeHelper(
...crawlerOptions,
},
pageOptions: pageOptions,
extractorOptions: extractorOptions
});

const docs = await a.getDocuments(false);
Expand Down Expand Up @@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
const origin = req.body.origin ?? "api";

try {
Expand All @@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
req,
team_id,
crawlerOptions,
pageOptions
pageOptions,
extractorOptions
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
Expand Down
48 changes: 48 additions & 0 deletions apps/api/src/lib/LLM-extraction/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import Turndown from 'turndown'
import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import {
ScraperCompletionResult,
generateOpenAICompletions,
} from './models.js'
import { ExtractorOptions } from '../entities.js'

// Generate completion using OpenAI
export function generateCompletions(
documents: Document[],
extractionOptions: ExtractorOptions
): Promise < ScraperCompletionResult < T >> [] {
// const schema = zodToJsonSchema(options.schema)

const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt;

const loader = documents.map(async (document, i) => {
switch (this.client.constructor) {
case true:
return generateOpenAICompletions<T>(
this.client as OpenAI,

schema,
options?.prompt,
options?.temperature
)

//TODO add other models
// case LlamaModel:
// return generateLlamaCompletions<T>(
// this.client,
// await page,
// schema,
// options?.prompt,
// options?.temperature
// )
default:
throw new Error('Invalid client')
}
})

return loader
}
56 changes: 34 additions & 22 deletions apps/api/src/lib/LLM-extraction/models.ts
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
import OpenAI from 'openai'
import { z } from 'zod'
import { ScraperLoadResult } from './types'
import { Document, ExtractorOptions } from "../../lib/entities";

// import {
// LlamaModel,
// LlamaJsonSchemaGrammar,
// LlamaContext,
// LlamaChatSession,
// GbnfJsonSchema,
// } from 'node-llama-cpp'
import { JsonSchema7Type } from 'zod-to-json-schema'
// import { JsonSchema7Type } from 'zod-to-json-schema'

export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
data: z.infer<T> | null
data: any | null
url: string
}

const defaultPrompt =
'You are a satistified web scraper. Extract the contents of the webpage'

function prepareOpenAIPage(
page: ScraperLoadResult
function prepareOpenAIDoc(
document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
if (page.mode === 'image') {
return [
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${page.content}` },
},
]

// Check if the markdown content exists in the document
if (!document.markdown) {
throw new Error("Markdown content is missing in the document.");
}

return [{ type: 'text', text: page.content }]
return [{ type: 'text', text: document.markdown }]
}

export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
export async function generateOpenAICompletions<T>({
client,
model = 'gpt-3.5-turbo',
document,
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
temperature
}: {
client: OpenAI,
model: string = 'gpt-3.5-turbo',
page: ScraperLoadResult,
schema: JsonSchema7Type,
prompt: string = defaultPrompt,
model?: string,
document: Document,
schema: any, // This should be replaced with a proper Zod schema type when available
prompt?: string,
temperature?: number
): Promise<ScraperCompletionResult<T>> {
}): Promise<Document> {
const openai = client as OpenAI
const content = prepareOpenAIPage(page)
const content = prepareOpenAIDoc(document)

const completion = await openai.chat.completions.create({
model,
Expand All @@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
})

const c = completion.choices[0].message.tool_calls[0].function.arguments

// Extract the LLM extraction content from the completion response
const llmExtraction = c;

// Return the document with the LLM extraction content added
return {
data: JSON.parse(c),
url: page.url,
}
...document,
llm_extraction: llmExtraction
};

}

// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
Expand Down
5 changes: 0 additions & 5 deletions apps/api/src/lib/LLM-extraction/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
closeOnFinish?: boolean
}

export type ScraperLoadResult = {
url: string
content: string
mode: ScraperLoadOptions['mode']
}
8 changes: 8 additions & 0 deletions apps/api/src/lib/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ export type PageOptions = {

};

// Controls how scraped page content is post-processed before being returned.
export type ExtractorOptions = {
// "markdown": return the scraped markdown as-is (default in scrape.ts);
// "llm-extraction": additionally run the content through an LLM to pull
// out structured data (see lib/LLM-extraction).
mode: "markdown" | "llm-extraction";
// Natural-language instruction for the model; models.ts falls back to a
// default prompt when omitted.
extractionPrompt?: string;
// JSON-Schema-style object describing the fields to extract.
extractionSchema?: Record<string, any>;
}

export type SearchOptions = {
limit?: number;
tbs?: string;
Expand All @@ -38,6 +44,7 @@ export type WebScraperOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
};
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
};

Expand All @@ -50,6 +57,7 @@ export class Document {
url?: string; // Used only in /search for now
content: string;
markdown?: string;
llm_extraction?: string;
createdAt?: Date;
updatedAt?: Date;
type?: string;
Expand Down
22 changes: 21 additions & 1 deletion apps/api/src/scraper/WebScraper/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
Expand All @@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
import OpenAI from 'openai'


export class WebScraperDataProvider {
private urls: string[] = [""];
Expand All @@ -19,6 +21,7 @@ export class WebScraperDataProvider {
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";

Expand Down Expand Up @@ -191,6 +194,22 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(baseUrl, documents);
documents = documents.concat(pdfDocuments);




if(this.extractorOptions.mode === "llm-extraction") {

// const llm = new OpenAI()
// generateCompletions(
// client=llm,
// page =,
// schema=

// )


}

await this.setCachedDocuments(documents);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
Expand Down Expand Up @@ -376,6 +395,7 @@ export class WebScraperDataProvider {
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;

//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
Expand Down

0 comments on commit 0649772

Please sign in to comment.