
Commit c91f09e
Added captions example
iceener committed Nov 14, 2024
1 parent 5fb5929 commit c91f09e
Showing 6 changed files with 609 additions and 0 deletions.
111 changes: 111 additions & 0 deletions captions/OpenAIService.ts
@@ -0,0 +1,111 @@
import OpenAI from "openai";
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
import { createByModelName } from '@microsoft/tiktokenizer';

export class OpenAIService {
  private openai: OpenAI;
  private tokenizers: Map<string, Awaited<ReturnType<typeof createByModelName>>> = new Map();
  private readonly IM_START = "<|im_start|>";
  private readonly IM_END = "<|im_end|>";
  private readonly IM_SEP = "<|im_sep|>";

  constructor() {
    this.openai = new OpenAI();
  }

  private async getTokenizer(modelName: string) {
    if (!this.tokenizers.has(modelName)) {
      const specialTokens: ReadonlyMap<string, number> = new Map([
        [this.IM_START, 100264],
        [this.IM_END, 100265],
        [this.IM_SEP, 100266],
      ]);
      const tokenizer = await createByModelName(modelName, specialTokens);
      this.tokenizers.set(modelName, tokenizer);
    }
    return this.tokenizers.get(modelName)!;
  }

  async countTokens(messages: ChatCompletionMessageParam[], model: string = 'gpt-4o'): Promise<number> {
    const tokenizer = await this.getTokenizer(model);

    let formattedContent = '';
    messages.forEach((message) => {
      formattedContent += `${this.IM_START}${message.role}${this.IM_SEP}${message.content || ''}${this.IM_END}`;
    });
    formattedContent += `${this.IM_START}assistant${this.IM_SEP}`;

    const tokens = tokenizer.encode(formattedContent, [this.IM_START, this.IM_END, this.IM_SEP]);
    return tokens.length;
  }
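  // For example (illustrative, not part of the commit):
  // countTokens([{ role: 'user', content: 'Hi' }]) encodes
  // "<|im_start|>user<|im_sep|>Hi<|im_end|><|im_start|>assistant<|im_sep|>"
  // with the special tokens above and returns the length of that token
  // sequence, mirroring how the chat format is framed server-side.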

  async completion(
    messages: ChatCompletionMessageParam[],
    model: string = "gpt-4o",
    stream: boolean = false,
    jsonMode: boolean = false,
    maxTokens: number = 4096
  ): Promise<OpenAI.Chat.Completions.ChatCompletion | AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>> {
    try {
      const chatCompletion = await this.openai.chat.completions.create({
        messages,
        model,
        // o1 models don't accept streaming, max_tokens, or response_format,
        // so these options are only spread in for other models
        ...(model !== 'o1-mini' && model !== 'o1-preview' && {
          stream,
          max_tokens: maxTokens,
          response_format: jsonMode ? { type: "json_object" } : { type: "text" }
        })
      });

      if (stream) {
        return chatCompletion as AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>;
      } else {
        return chatCompletion as OpenAI.Chat.Completions.ChatCompletion;
      }
    } catch (error) {
      console.error("Error in OpenAI completion:", error);
      throw error;
    }
  }

  async calculateImageTokens(width: number, height: number, detail: 'low' | 'high'): Promise<number> {
    let tokenCost = 0;

    if (detail === 'low') {
      tokenCost += 85;
      return tokenCost;
    }

    const MAX_DIMENSION = 2048;
    const SCALE_SIZE = 768;

    // Resize to fit within MAX_DIMENSION x MAX_DIMENSION
    if (width > MAX_DIMENSION || height > MAX_DIMENSION) {
      const aspectRatio = width / height;
      if (aspectRatio > 1) {
        width = MAX_DIMENSION;
        height = Math.round(MAX_DIMENSION / aspectRatio);
      } else {
        height = MAX_DIMENSION;
        width = Math.round(MAX_DIMENSION * aspectRatio);
      }
    }

    // Scale the shortest side to SCALE_SIZE
    if (width >= height && height > SCALE_SIZE) {
      width = Math.round((SCALE_SIZE / height) * width);
      height = SCALE_SIZE;
    } else if (height > width && width > SCALE_SIZE) {
      height = Math.round((SCALE_SIZE / width) * height);
      width = SCALE_SIZE;
    }

    // Calculate the number of 512px squares
    const numSquares = Math.ceil(width / 512) * Math.ceil(height / 512);

    // Calculate the token cost
    tokenCost += (numSquares * 170) + 85;

    return tokenCost;
  }
}
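As a sanity check on the image-token math above, here is a worked example (not part of the commit; the dimensions are arbitrary, and the top-level await assumes an ESM/Bun context):

const service = new OpenAIService();

// A 2048x4096 image at 'high' detail is first capped to 1024x2048 (longest
// side 2048), then its shorter side is scaled to 768, giving 768x1536.
// That makes ceil(768/512) * ceil(1536/512) = 2 * 3 = 6 tiles of 512px,
// so the cost is 6 * 170 + 85 = 1105 tokens.
const cost = await service.calculateImageTokens(2048, 4096, 'high');
console.log(cost); // 1105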
156 changes: 156 additions & 0 deletions captions/app.ts
@@ -0,0 +1,156 @@
import type { ChatCompletion, ChatCompletionMessageParam } from "openai/resources/chat/completions";
import { OpenAIService } from './OpenAIService';
import { readFile, writeFile } from 'fs/promises';
import { join } from 'path';
import { extractImageContextSystemMessage, refineDescriptionSystemMessage, previewImageSystemMessage } from './prompts';

const openaiService = new OpenAIService();

// An image extracted from the article, enriched as it moves through the pipeline
export type Image = {
  alt: string;
  url: string;
  context: string;
  description: string;
  preview: string;
  base64: string;
  name: string;
};


async function extractImages(article: string): Promise<Image[]> {
  const imageRegex = /!\[([^\]]*)\]\(([^)]+)\)/g;
  const matches = [...article.matchAll(imageRegex)];

  const imagePromises = matches.map(async ([, alt, url]) => {
    try {
      const name = url.split('/').pop() || '';
      const response = await fetch(url);
      if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.statusText}`);
      const arrayBuffer = await response.arrayBuffer();
      const base64 = Buffer.from(arrayBuffer).toString('base64');

      return {
        alt,
        url,
        context: '',
        description: '',
        preview: '',
        base64,
        name
      };
    } catch (error) {
      console.error(`Error processing image ${url}:`, error);
      return null;
    }
  });

  const results = await Promise.all(imagePromises);
  return results.filter((link): link is Image => link !== null);
}
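// Illustrative example (not from the commit): given the markdown
//   ![Cover photo](https://example.com/images/cover.png)
// the regex above captures alt = "Cover photo" and
// url = "https://example.com/images/cover.png", so name becomes "cover.png";
// the file is fetched and base64-encoded for inlining as a data URL below.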


// Ask the vision model for a short, structured preview of a single image
async function previewImage(image: Image): Promise<{ name: string; preview: string }> {
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: [
      {
        type: "image_url",
        image_url: { url: `data:image/jpeg;base64,${image.base64}` }
      },
      {
        type: "text",
        text: `Describe the image ${image.name} concisely. Focus on the main elements and overall composition. Return the result in JSON format with only 'name' and 'preview' properties.`
      }
    ]
  };

  const response = await openaiService.completion([previewImageSystemMessage, userMessage], 'gpt-4o', false, true) as ChatCompletion;
  const result = JSON.parse(response.choices[0].message.content || '{}');
  return { name: result.name || image.name, preview: result.preview || '' };
}
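// Example of the JSON shape expected back (illustrative, not from the commit):
// { "name": "cover.png", "preview": "A microphone on a desk next to a laptop." }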

async function getImageContext(title: string, article: string, images: Image[]): Promise<{ images: Array<{ name: string, context: string, preview: string }> }> {
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: `Title: ${title}\n\n${article}`
  };

  const response = await openaiService.completion([extractImageContextSystemMessage(images), userMessage], 'gpt-4o', false, true) as ChatCompletion;
  const result = JSON.parse(response.choices[0].message.content || '{}');

  // Generate previews for all images simultaneously
  const previewPromises = images.map(image => previewImage(image));
  const previews = await Promise.all(previewPromises);

  // Merge context and preview information
  const mergedResults = result.images.map((contextImage: { name: string, context: string }) => {
    const preview = previews.find(p => p.name === contextImage.name);
    return {
      ...contextImage,
      preview: preview ? preview.preview : ''
    };
  });

  return { images: mergedResults };
}

// Combine the raw image, its article context, and the preview into a final description
async function refineDescription(image: Image): Promise<Image> {
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: [
      {
        type: "image_url",
        image_url: { url: `data:image/jpeg;base64,${image.base64}` }
      },
      {
        type: "text",
        text: `Write a description of the image ${image.name}. I have some <context>${image.context}</context> that should be useful for understanding the image in a better way. An initial preview of the image is: <preview>${image.preview}</preview>. A good description briefly describes what is on the image, and uses the context to make it more relevant to the article. The purpose of this description is to summarize the article, so we need just the essence of the image in its context, not a detailed account of everything in it.`
      }
    ]
  };

  console.log(userMessage); // debug: inspect the prompt sent to the model

  const response = await openaiService.completion([refineDescriptionSystemMessage, userMessage], 'gpt-4o', false) as ChatCompletion;
  const result = response.choices[0].message.content || '';
  return { ...image, description: result };
}

/**
 * Orchestrates the full pipeline: extracts images from the article, gathers
 * their context and previews, refines each description, and saves the results.
 */
async function processAndSummarizeImages(title: string, path: string) {
  // Read the article file
  const article = await readFile(path, 'utf-8');

  // Extract images from the article
  const images = await extractImages(article);
  console.log('Number of images found:', images.length);

  const contexts = await getImageContext(title, article, images);
  console.log('Number of image metadata entries found:', contexts.images.length);

  // Process each image: use context and preview from getImageContext, then refine the description
  const processedImages = await Promise.all(images.map(async (image) => {
    const { context = '', preview = '' } = contexts.images.find(ctx => ctx.name === image.name) || {};
    return await refineDescription({ ...image, preview, context });
  }));

  // Prepare and save the described images (excluding base64 data)
  const describedImages = processedImages.map(({ base64, ...rest }) => rest);
  await writeFile(join(__dirname, 'descriptions.json'), JSON.stringify(describedImages, null, 2));

  // Prepare and save the final data (only url and description)
  const captions = describedImages.map(({ url, description }) => ({ url, description }));
  await writeFile(join(__dirname, 'captions.json'), JSON.stringify(captions, null, 2));

  // Log completion message
  console.log('Final data saved to descriptions.json and captions.json');
}

// Execute the main function
processAndSummarizeImages('Lesson #0201 — Audio i interfejs głosowy', join(__dirname, 'article.md'))
  .catch(error => console.error('Error while processing and summarizing images:', error));
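To try the example (a note added here, not part of the commit): assuming a Bun runtime with OPENAI_API_KEY set, and article.md plus the prompts module from this commit in the captions directory, running `bun run captions/app.ts` executes the pipeline and writes descriptions.json and captions.json alongside the script.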
