forked from BuilderIO/gpt-crawler
Showing 7 changed files with 134 additions and 5,434 deletions.
**README.md**

The stock Crawlee template README ("Getting started with Crawlee," which described using `PlaywrightCrawler` to recursively crawl https://crawlee.dev with the browser automation library [Playwright](https://playwright.dev), and linked to the Crawlee [step-by-step tutorial](https://crawlee.dev/docs/introduction), the `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler), and other [examples](https://crawlee.dev/docs/examples/playwright-crawler)) is replaced with:

# GPT Crawler

Crawl a site to generate knowledge files to create your own custom GPT

## Get started

### Prerequisites
Be sure you have Node.js >= 16 installed.

### Clone the repo

```sh
git clone https://github.com/bridgeproject/gpt-crawler
```
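Then install dependencies before running. The README doesn't show this step, so this is a sketch assuming the standard npm workflow:

```sh
cd gpt-crawler
npm install
```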
### Configure the crawler

Open [config.ts](config.ts) and edit the `url` and `selector` properties to match your needs.

E.g., to crawl the Builder.io docs to make your own custom GPT, you can use:

```ts
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
};
```
See the top of the file for the type definition of everything you can configure:

```ts
type Config = {
  /** URL to start the crawl */
  url: string;
  /** Pattern to match against for links on a page to subsequently crawl */
  match: string;
  /** Selector to grab the inner text from */
  selector: string;
  /** Don't crawl more than this many pages */
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional function to run for each page found */
  onVisitPage?: (options: {
    page: Page;
    pushData: (data: any) => Promise<void>;
  }) => Promise<void>;
};
```
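The `onVisitPage` hook hands you the Playwright `Page` for each crawled URL. A minimal sketch of hypothetical usage, assuming `pushData` appends a record to the output file:

```ts
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
  // Hypothetical hook: record each page's URL and title alongside the
  // extracted text. `page` is a Playwright Page, so any Playwright API
  // (page.title(), page.url(), ...) is available here.
  onVisitPage: async ({ page, pushData }) => {
    const title = await page.title();
    await pushData({ url: page.url(), title });
  },
};
```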
### Run your crawler

```sh
npm start
```
### Upload your data to OpenAI

The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom GPT or custom assistant.
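If you'd rather script the upload than use the dashboard, a minimal sketch with the official `openai` Node package (this step isn't in the README; it assumes `OPENAI_API_KEY` is set in your environment, and the printed file id can then be attached to an assistant):

```ts
import fs from "node:fs";
import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Upload the crawl output so it can be attached to a custom assistant.
const file = await openai.files.create({
  file: fs.createReadStream("output.json"),
  purpose: "assistants",
});

console.log(`Uploaded file: ${file.id}`);
```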
## Contributing

Know how to make this project better? Send a PR!

<br>
<br>

<p align="center">
  <a href="https://www.builder.io/m/developers">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="https://user-images.githubusercontent.com/844291/230786554-eb225eeb-2f6b-4286-b8c2-535b1131744a.png">
      <img width="250" alt="Made with love by Builder.io" src="https://user-images.githubusercontent.com/844291/230786555-a58479e4-75f3-4222-a6eb-74c5af953eac.png">
    </picture>
  </a>
</p>
**config.ts**

The default configuration changes from crawling the BuilderIO/builder GitHub repo:

```ts
export const config = {
  url: "https://github.com/builderio/builder",
  match: "https://github.com/BuilderIO/builder/tree/main/**",
  selector: `#readme,[data-selector="repos-split-pane-content"]`,
  maxPagesToCrawl: 1000,
  outputFileName: "github.json",
} satisfies Config;
```

to crawling the Builder.io docs, typed against the `Config` definition at the top of the file:

```ts
import { Page } from "playwright";

type Config = {
  /** URL to start the crawl */
  url: string;
  /** Pattern to match against for links on a page to subsequently crawl */
  match: string;
  /** Selector to grab the inner text from */
  selector: string;
  /** Don't crawl more than this many pages */
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional function to run for each page found */
  onVisitPage?: (options: {
    page: Page;
    pushData: (data: any) => Promise<void>;
  }) => Promise<void>;
};

export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
};
```