forked from BuilderIO/gpt-crawler
Showing 7 changed files with 134 additions and 5,434 deletions.
**README.md**

The stock Crawlee template README ("Getting started with Crawlee," which described using `PlaywrightCrawler` to recursively crawl https://crawlee.dev with the browser automation library [Playwright](https://playwright.dev), and linked to the Crawlee [step-by-step tutorial](https://crawlee.dev/docs/introduction), the `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler), and other [examples](https://crawlee.dev/docs/examples/playwright-crawler)) is replaced with:

# GPT Crawler

Crawl a site to generate knowledge files to create your own custom GPT

## Get started

### Prerequisites
Be sure you have Node.js >= 16 installed.

### Clone the repo

```sh
git clone https://github.com/bridgeproject/gpt-crawler
```
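Then install dependencies before running. The README doesn't show this step, so this is a sketch assuming the standard npm workflow:

```sh
cd gpt-crawler
npm install
```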
### Configure the crawler

Open [config.ts](config.ts) and edit the `url` and `selector` properties to match your needs.

E.g., to crawl the Builder.io docs to make your own custom GPT, you can use:

```ts
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
};
```
See the top of the file for the type definition of everything you can configure:

```ts
type Config = {
  /** URL to start the crawl */
  url: string;
  /** Pattern to match against for links on a page to subsequently crawl */
  match: string;
  /** Selector to grab the inner text from */
  selector: string;
  /** Don't crawl more than this many pages */
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional function to run for each page found */
  onVisitPage?: (options: {
    page: Page;
    pushData: (data: any) => Promise<void>;
  }) => Promise<void>;
};
```
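The `onVisitPage` hook hands you the Playwright `Page` for each crawled URL. A minimal sketch of hypothetical usage, assuming `pushData` appends a record to the output file:

```ts
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
  // Hypothetical hook: record each page's URL and title alongside the
  // extracted text. `page` is a Playwright Page, so any Playwright API
  // (page.title(), page.url(), ...) is available here.
  onVisitPage: async ({ page, pushData }) => {
    const title = await page.title();
    await pushData({ url: page.url(), title });
  },
};
```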
### Run your crawler

```sh
npm start
```
### Upload your data to OpenAI

The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom GPT or custom assistant.
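If you'd rather script the upload than use the dashboard, a minimal sketch with the official `openai` Node package (this step isn't in the README; it assumes `OPENAI_API_KEY` is set in your environment, and the printed file id can then be attached to an assistant):

```ts
import fs from "node:fs";
import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Upload the crawl output so it can be attached to a custom assistant.
const file = await openai.files.create({
  file: fs.createReadStream("output.json"),
  purpose: "assistants",
});

console.log(`Uploaded file: ${file.id}`);
```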
## Contributing

Know how to make this project better? Send a PR!

<br>
<br>

<p align="center">
  <a href="https://www.builder.io/m/developers">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="https://user-images.githubusercontent.com/844291/230786554-eb225eeb-2f6b-4286-b8c2-535b1131744a.png">
      <img width="250" alt="Made with love by Builder.io" src="https://user-images.githubusercontent.com/844291/230786555-a58479e4-75f3-4222-a6eb-74c5af953eac.png">
    </picture>
  </a>
</p>
**config.ts**

The default configuration changes from crawling the BuilderIO/builder GitHub repo:

```ts
export const config = {
  url: "https://github.com/builderio/builder",
  match: "https://github.com/BuilderIO/builder/tree/main/**",
  selector: `#readme,[data-selector="repos-split-pane-content"]`,
  maxPagesToCrawl: 1000,
  outputFileName: "github.json",
} satisfies Config;
```

to crawling the Builder.io docs, typed against the `Config` definition at the top of the file:

```ts
import { Page } from "playwright";

type Config = {
  /** URL to start the crawl */
  url: string;
  /** Pattern to match against for links on a page to subsequently crawl */
  match: string;
  /** Selector to grab the inner text from */
  selector: string;
  /** Don't crawl more than this many pages */
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional function to run for each page found */
  onVisitPage?: (options: {
    page: Page;
    pushData: (data: any) => Promise<void>;
  }) => Promise<void>;
};

export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
};
```