S03E01
iceener committed Nov 17, 2024
1 parent b588fb3 commit 212b5ec
Showing 15 changed files with 2,626 additions and 0 deletions.
1 change: 1 addition & 0 deletions package.json
@@ -44,6 +44,7 @@
"audio": "bun run audio/app.ts",
"audio-map": "bun run audio-map/app.ts",
"text-splitter": "bun run text-splitter/app.ts",
"unstructured": "bun run unstructured/app.ts",
"video": "bun run video/app.ts",
"dev": "vite",
"build": "vite build",
207 changes: 207 additions & 0 deletions text-splitter/TextService.ts
@@ -0,0 +1,207 @@
import { createByModelName } from '@microsoft/tiktokenizer';

interface IDoc {
  text: string;
  metadata: {
    tokens: number;
    headers: Headers;
    urls: string[];
    images: string[];
  };
}

interface Headers {
  [key: string]: string[];
}

export class TextSplitter {
  private tokenizer?: Awaited<ReturnType<typeof createByModelName>>;

  private readonly MODEL_NAME: string;
  private readonly SPECIAL_TOKENS = new Map<string, number>([
    ['<|im_start|>', 100264],
    ['<|im_end|>', 100265],
    ['<|im_sep|>', 100266],
  ]);

  constructor(modelName: string = 'gpt-4') {
    this.MODEL_NAME = modelName;
  }

  private async initializeTokenizer(): Promise<void> {
    if (!this.tokenizer) {
      this.tokenizer = await createByModelName(this.MODEL_NAME, this.SPECIAL_TOKENS);
    }
  }

  private countTokens(text: string): number {
    if (!this.tokenizer) {
      throw new Error('Tokenizer not initialized');
    }
    const formattedContent = this.formatForTokenization(text);
    const tokens = this.tokenizer.encode(formattedContent, Array.from(this.SPECIAL_TOKENS.keys()));
    return tokens.length;
  }

  private formatForTokenization(text: string): string {
    return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant<|im_end|>`;
  }

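  // Splits text into token-limited chunks, tagging each chunk with its token count,
  // the currently active markdown headers, and any extracted URLs/images.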
  async split(text: string, limit: number): Promise<IDoc[]> {
    console.log(`Starting split process with limit: ${limit} tokens`);
    await this.initializeTokenizer();
    const chunks: IDoc[] = [];
    let position = 0;
    const totalLength = text.length;
    const currentHeaders: Headers = {};

    while (position < totalLength) {
      console.log(`Processing chunk starting at position: ${position}`);
      const { chunkText, chunkEnd } = this.getChunk(text, position, limit);
      const tokens = this.countTokens(chunkText);
      console.log(`Chunk tokens: ${tokens}`);

      const headersInChunk = this.extractHeaders(chunkText);
      this.updateCurrentHeaders(currentHeaders, headersInChunk);

      const { content, urls, images } = this.extractUrlsAndImages(chunkText);

      chunks.push({
        text: content,
        metadata: {
          tokens,
          headers: { ...currentHeaders },
          urls,
          images,
        },
      });

      console.log(`Chunk processed. New position: ${chunkEnd}`);
      position = chunkEnd;
    }

    console.log(`Split process completed. Total chunks: ${chunks.length}`);
    return chunks;
  }

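  // Finds a slice starting at `start` that fits within `limit` tokens (including the
  // chat-template overhead), then tries to align its end with a newline boundary.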
  private getChunk(text: string, start: number, limit: number): { chunkText: string; chunkEnd: number } {
    console.log(`Getting chunk starting at ${start} with limit ${limit}`);

    // Account for token overhead due to formatting
    const overhead = this.countTokens(this.formatForTokenization('')) - this.countTokens('');

    // Initial tentative end position
    let end = Math.min(start + Math.floor((text.length - start) * limit / this.countTokens(text.slice(start))), text.length);

    // Adjust end to avoid exceeding token limit
    let chunkText = text.slice(start, end);
    let tokens = this.countTokens(chunkText);

    while (tokens + overhead > limit && end > start) {
      console.log(`Chunk exceeds limit with ${tokens + overhead} tokens. Adjusting end position...`);
      end = this.findNewChunkEnd(text, start, end);
      chunkText = text.slice(start, end);
      tokens = this.countTokens(chunkText);
    }

    // Adjust chunk end to align with newlines without significantly reducing size
    end = this.adjustChunkEnd(text, start, end, tokens + overhead, limit);

    chunkText = text.slice(start, end);
    tokens = this.countTokens(chunkText);
    console.log(`Final chunk end: ${end}`);
    return { chunkText, chunkEnd: end };
  }

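  // Tries to snap the chunk end to a nearby newline, accepting the move only if the
  // chunk stays within the limit and keeps at least 80% of it.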
  private adjustChunkEnd(text: string, start: number, end: number, currentTokens: number, limit: number): number {
    const minChunkTokens = limit * 0.8; // Minimum chunk size is 80% of limit

    const nextNewline = text.indexOf('\n', end);
    const prevNewline = text.lastIndexOf('\n', end);

    // Try extending to next newline
    if (nextNewline !== -1 && nextNewline < text.length) {
      const extendedEnd = nextNewline + 1;
      const chunkText = text.slice(start, extendedEnd);
      const tokens = this.countTokens(chunkText);
      if (tokens <= limit && tokens >= minChunkTokens) {
        console.log(`Extending chunk to next newline at position ${extendedEnd}`);
        return extendedEnd;
      }
    }

    // Try reducing to previous newline
    if (prevNewline > start) {
      const reducedEnd = prevNewline + 1;
      const chunkText = text.slice(start, reducedEnd);
      const tokens = this.countTokens(chunkText);
      if (tokens <= limit && tokens >= minChunkTokens) {
        console.log(`Reducing chunk to previous newline at position ${reducedEnd}`);
        return reducedEnd;
      }
    }

    // Return original end if adjustments aren't suitable
    return end;
  }

  private findNewChunkEnd(text: string, start: number, end: number): number {
    // Reduce end position to try to fit within token limit
    let newEnd = end - Math.floor((end - start) / 10); // Reduce by 10% each iteration
    if (newEnd <= start) {
      newEnd = start + 1; // Ensure at least one character is included
    }
    return newEnd;
  }

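  // Collects markdown headings (levels 1-6) found in the chunk, grouped by level (h1-h6).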
  private extractHeaders(text: string): Headers {
    const headers: Headers = {};
    const headerRegex = /(^|\n)(#{1,6})\s+(.*)/g;
    let match;

    while ((match = headerRegex.exec(text)) !== null) {
      const level = match[2].length;
      const content = match[3].trim();
      const key = `h${level}`;
      headers[key] = headers[key] || [];
      headers[key].push(content);
    }

    return headers;
  }

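  // Carries heading context across chunks: a new heading at a given level replaces
  // the previous one and clears all lower-level headings.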
  private updateCurrentHeaders(current: Headers, extracted: Headers): void {
    for (let level = 1; level <= 6; level++) {
      const key = `h${level}`;
      if (extracted[key]) {
        current[key] = extracted[key];
        this.clearLowerHeaders(current, level);
      }
    }
  }

  private clearLowerHeaders(headers: Headers, level: number): void {
    for (let l = level + 1; l <= 6; l++) {
      delete headers[`h${l}`];
    }
  }

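  // Replaces markdown links and images with indexed placeholders ({{$url0}}, {{$img0}}, ...)
  // and returns the extracted URLs and image sources alongside the rewritten text.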
  private extractUrlsAndImages(text: string): { content: string; urls: string[]; images: string[] } {
    const urls: string[] = [];
    const images: string[] = [];
    let urlIndex = 0;
    let imageIndex = 0;

    const content = text
      .replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_match, altText, url) => {
        images.push(url);
        return `![${altText}]({{$img${imageIndex++}}})`;
      })
      .replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, linkText, url) => {
        urls.push(url);
        return `[${linkText}]({{$url${urlIndex++}}})`;
      });

    return { content, urls, images };
  }
}
45 changes: 45 additions & 0 deletions text-splitter/app.ts
@@ -0,0 +1,45 @@
import fs from 'fs';
import path from 'path';
import { TextSplitter } from "./TextService";

const splitter = new TextSplitter();

async function processFile(filePath: string) {
  const text = fs.readFileSync(filePath, 'utf-8');
  const docs = await splitter.split(text, 1000);
  const jsonFilePath = path.join(path.dirname(filePath), `${path.basename(filePath, '.md')}.json`);
  fs.writeFileSync(jsonFilePath, JSON.stringify(docs, null, 2));

  const chunkSizes = docs.map(doc => doc.metadata.tokens);
  const avgChunkSize = chunkSizes.reduce((sum, size) => sum + size, 0) / chunkSizes.length;
  const minChunkSize = Math.min(...chunkSizes);
  const maxChunkSize = Math.max(...chunkSizes);
  const medianChunkSize = chunkSizes.sort((a, b) => a - b)[Math.floor(chunkSizes.length / 2)];

  return {
    file: path.basename(filePath),
    avgChunkSize: avgChunkSize.toFixed(2),
    medianChunkSize,
    minChunkSize,
    maxChunkSize,
    totalChunks: chunkSizes.length
  };
}

async function main() {
  // Get all markdown files in the current directory
  const directoryPath = path.join(__dirname);
  const files = fs.readdirSync(directoryPath);
  const reports = [];

  for (const file of files) {
    if (path.extname(file) === '.md') {
      const report = await processFile(path.join(directoryPath, file));
      reports.push(report);
    }
  }

  console.table(reports);
}

main().catch(console.error);
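
To try this locally, the `text-splitter` script added to package.json above should run this entry point (assuming Bun is installed): `bun run text-splitter`. Each `.md` file in the directory gets a matching `.json` file containing its chunks, and a per-file summary table is printed to the console.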