Skip to content

Commit

Permalink
Caleb: got it to a testable state I believe
Browse files Browse the repository at this point in the history
  • Loading branch information
calebpeffer committed Apr 28, 2024
1 parent 6ee1f2d commit 0649772
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 31 deletions.
44 changes: 43 additions & 1 deletion apps/api/src/__tests__/e2e_withAuth/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";


describe("E2E Tests for API Routes", () => {
describe.only("E2E Tests for API Routes", () => {
beforeAll(() => {
process.env.USE_DB_AUTHENTICATION = "true";
});
Expand Down Expand Up @@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
}, 60000); // 60 seconds
});

describe("POST /v0/scrape with LLM Extraction", () => {
  it("should extract data using LLM extraction mode", async () => {
    const response = await request(TEST_URL)
      .post("/v0/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        url: "https://mendable.ai",
        pageOptions: {
          onlyMainContent: true
        },
        // Field names must match ExtractorOptions in lib/entities.ts
        // ({ mode, extractionPrompt, extractionSchema }); the previous
        // names (extractorMode / extractor_prompt / extractorSchema) were
        // silently ignored, leaving the scrape in plain "markdown" mode.
        extractorOptions: {
          mode: "llm-extraction",
          extractionPrompt:
            "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
          extractionSchema: {
            type: "object",
            properties: {
              company_mission: { type: "string" },
              supports_sso: { type: "boolean" },
              is_open_source: { type: "boolean" }
            },
            required: ["company_mission", "supports_sso", "is_open_source"]
          }
        }
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // models.ts attaches the raw tool-call arguments to the document as
    // `llm_extraction` (a JSON string per the OpenAI tool-calls API), so the
    // extracted fields are NOT top-level properties of `data`.
    expect(response.body.data).toHaveProperty("llm_extraction");
    const extraction = JSON.parse(response.body.data.llm_extraction);
    expect(extraction).toHaveProperty("company_mission");
    expect(typeof extraction.supports_sso).toBe("boolean");
    expect(typeof extraction.is_open_source).toBe("boolean");
  }, 60000); // live scrape + LLM call can be slow; matches sibling e2e timeouts
});

describe("GET /is-production", () => {
it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production");
Expand Down
11 changes: 9 additions & 2 deletions apps/api/src/controllers/scrape.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { ExtractorOptions } from './../lib/entities';
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
Expand All @@ -11,7 +12,8 @@ export async function scrapeHelper(
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: any
pageOptions: any,
extractorOptions: any
): Promise<{
success: boolean;
error?: string;
Expand All @@ -35,6 +37,7 @@ export async function scrapeHelper(
...crawlerOptions,
},
pageOptions: pageOptions,
extractorOptions: extractorOptions
});

const docs = await a.getDocuments(false);
Expand Down Expand Up @@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
const origin = req.body.origin ?? "api";

try {
Expand All @@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
req,
team_id,
crawlerOptions,
pageOptions
pageOptions,
extractorOptions
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
Expand Down
48 changes: 48 additions & 0 deletions apps/api/src/lib/LLM-extraction/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import Turndown from 'turndown'
import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import {
ScraperCompletionResult,
generateOpenAICompletions,
} from './models.js'
import { ExtractorOptions } from '../entities.js'

// Generate completion using OpenAI
export function generateCompletions(
documents: Document[],
extractionOptions: ExtractorOptions
): Promise < ScraperCompletionResult < T >> [] {
// const schema = zodToJsonSchema(options.schema)

const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt;

const loader = documents.map(async (document, i) => {
switch (this.client.constructor) {
case true:
return generateOpenAICompletions<T>(
this.client as OpenAI,

schema,
options?.prompt,
options?.temperature
)

//TODO add other models
// case LlamaModel:
// return generateLlamaCompletions<T>(
// this.client,
// await page,
// schema,
// options?.prompt,
// options?.temperature
// )
default:
throw new Error('Invalid client')
}
})

return loader
}
56 changes: 34 additions & 22 deletions apps/api/src/lib/LLM-extraction/models.ts
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
import OpenAI from 'openai'
import { z } from 'zod'
import { ScraperLoadResult } from './types'
import { Document, ExtractorOptions } from "../../lib/entities";

// import {
// LlamaModel,
// LlamaJsonSchemaGrammar,
// LlamaContext,
// LlamaChatSession,
// GbnfJsonSchema,
// } from 'node-llama-cpp'
import { JsonSchema7Type } from 'zod-to-json-schema'
// import { JsonSchema7Type } from 'zod-to-json-schema'

export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
data: z.infer<T> | null
data: any | null
url: string
}

const defaultPrompt =
'You are a satistified web scraper. Extract the contents of the webpage'

function prepareOpenAIPage(
page: ScraperLoadResult
function prepareOpenAIDoc(
document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
if (page.mode === 'image') {
return [
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${page.content}` },
},
]

// Check if the markdown content exists in the document
if (!document.markdown) {
throw new Error("Markdown content is missing in the document.");
}

return [{ type: 'text', text: page.content }]
return [{ type: 'text', text: document.markdown }]
}

export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
export async function generateOpenAICompletions<T>({
client,
model = 'gpt-3.5-turbo',
document,
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
temperature
}: {
client: OpenAI,
model: string = 'gpt-3.5-turbo',
page: ScraperLoadResult,
schema: JsonSchema7Type,
prompt: string = defaultPrompt,
model?: string,
document: Document,
schema: any, // This should be replaced with a proper Zod schema type when available
prompt?: string,
temperature?: number
): Promise<ScraperCompletionResult<T>> {
}): Promise<Document> {
const openai = client as OpenAI
const content = prepareOpenAIPage(page)
const content = prepareOpenAIDoc(document)

const completion = await openai.chat.completions.create({
model,
Expand All @@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
})

const c = completion.choices[0].message.tool_calls[0].function.arguments

// Extract the LLM extraction content from the completion response
const llmExtraction = c;

// Return the document with the LLM extraction content added
return {
data: JSON.parse(c),
url: page.url,
}
...document,
llm_extraction: llmExtraction
};

}

// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
Expand Down
5 changes: 0 additions & 5 deletions apps/api/src/lib/LLM-extraction/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
closeOnFinish?: boolean
}

export type ScraperLoadResult = {
url: string
content: string
mode: ScraperLoadOptions['mode']
}
8 changes: 8 additions & 0 deletions apps/api/src/lib/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ export type PageOptions = {

};

// Controls how scraped page content is post-processed before being returned.
export type ExtractorOptions = {
// "markdown": return the scraped markdown as-is (default in scrape.ts);
// "llm-extraction": additionally run the content through an LLM to pull
// out structured data (see lib/LLM-extraction).
mode: "markdown" | "llm-extraction";
// Natural-language instruction for the model; models.ts falls back to a
// default prompt when omitted.
extractionPrompt?: string;
// JSON-Schema-style object describing the fields to extract.
extractionSchema?: Record<string, any>;
}

export type SearchOptions = {
limit?: number;
tbs?: string;
Expand All @@ -38,6 +44,7 @@ export type WebScraperOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
};
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
};

Expand All @@ -50,6 +57,7 @@ export class Document {
url?: string; // Used only in /search for now
content: string;
markdown?: string;
llm_extraction?: string;
createdAt?: Date;
updatedAt?: Date;
type?: string;
Expand Down
22 changes: 21 additions & 1 deletion apps/api/src/scraper/WebScraper/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
Expand All @@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
import OpenAI from 'openai'


export class WebScraperDataProvider {
private urls: string[] = [""];
Expand All @@ -19,6 +21,7 @@ export class WebScraperDataProvider {
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";

Expand Down Expand Up @@ -191,6 +194,22 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(baseUrl, documents);
documents = documents.concat(pdfDocuments);




if(this.extractorOptions.mode === "llm-extraction") {

// const llm = new OpenAI()
// generateCompletions(
// client=llm,
// page =,
// schema=

// )


}

await this.setCachedDocuments(documents);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
Expand Down Expand Up @@ -376,6 +395,7 @@ export class WebScraperDataProvider {
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;

//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
Expand Down

0 comments on commit 0649772

Please sign in to comment.