
Commit c91f09e
Added captions example
iceener committed Nov 14, 2024
1 parent 5fb5929 commit c91f09e
Showing 6 changed files with 609 additions and 0 deletions.
111 changes: 111 additions & 0 deletions captions/OpenAIService.ts
@@ -0,0 +1,111 @@
import OpenAI from "openai";
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
import { createByModelName } from '@microsoft/tiktokenizer';

export class OpenAIService {
  private openai: OpenAI;
  private tokenizers: Map<string, Awaited<ReturnType<typeof createByModelName>>> = new Map();
  private readonly IM_START = "<|im_start|>";
  private readonly IM_END = "<|im_end|>";
  private readonly IM_SEP = "<|im_sep|>";

  constructor() {
    this.openai = new OpenAI();
  }

  private async getTokenizer(modelName: string) {
    if (!this.tokenizers.has(modelName)) {
      const specialTokens: ReadonlyMap<string, number> = new Map([
        [this.IM_START, 100264],
        [this.IM_END, 100265],
        [this.IM_SEP, 100266],
      ]);
      const tokenizer = await createByModelName(modelName, specialTokens);
      this.tokenizers.set(modelName, tokenizer);
    }
    return this.tokenizers.get(modelName)!;
  }

  async countTokens(messages: ChatCompletionMessageParam[], model: string = 'gpt-4o'): Promise<number> {
    const tokenizer = await this.getTokenizer(model);

    let formattedContent = '';
    messages.forEach((message) => {
      formattedContent += `${this.IM_START}${message.role}${this.IM_SEP}${message.content || ''}${this.IM_END}`;
    });
    formattedContent += `${this.IM_START}assistant${this.IM_SEP}`;

    const tokens = tokenizer.encode(formattedContent, [this.IM_START, this.IM_END, this.IM_SEP]);
    return tokens.length;
  }
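  // For example (illustrative, not part of the commit):
  // countTokens([{ role: 'user', content: 'Hi' }]) encodes
  // "<|im_start|>user<|im_sep|>Hi<|im_end|><|im_start|>assistant<|im_sep|>"
  // with the special tokens above and returns the length of that token
  // sequence, mirroring how the chat format is framed server-side.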

  async completion(
    messages: ChatCompletionMessageParam[],
    model: string = "gpt-4o",
    stream: boolean = false,
    jsonMode: boolean = false,
    maxTokens: number = 4096
  ): Promise<OpenAI.Chat.Completions.ChatCompletion | AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>> {
    try {
      const chatCompletion = await this.openai.chat.completions.create({
        messages,
        model,
        // o1 models don't accept streaming, max_tokens, or response_format,
        // so these options are only spread in for other models
        ...(model !== 'o1-mini' && model !== 'o1-preview' && {
          stream,
          max_tokens: maxTokens,
          response_format: jsonMode ? { type: "json_object" } : { type: "text" }
        })
      });

      if (stream) {
        return chatCompletion as AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>;
      } else {
        return chatCompletion as OpenAI.Chat.Completions.ChatCompletion;
      }
    } catch (error) {
      console.error("Error in OpenAI completion:", error);
      throw error;
    }
  }

  async calculateImageTokens(width: number, height: number, detail: 'low' | 'high'): Promise<number> {
    let tokenCost = 0;

    if (detail === 'low') {
      tokenCost += 85;
      return tokenCost;
    }

    const MAX_DIMENSION = 2048;
    const SCALE_SIZE = 768;

    // Resize to fit within MAX_DIMENSION x MAX_DIMENSION
    if (width > MAX_DIMENSION || height > MAX_DIMENSION) {
      const aspectRatio = width / height;
      if (aspectRatio > 1) {
        width = MAX_DIMENSION;
        height = Math.round(MAX_DIMENSION / aspectRatio);
      } else {
        height = MAX_DIMENSION;
        width = Math.round(MAX_DIMENSION * aspectRatio);
      }
    }

    // Scale the shortest side to SCALE_SIZE
    if (width >= height && height > SCALE_SIZE) {
      width = Math.round((SCALE_SIZE / height) * width);
      height = SCALE_SIZE;
    } else if (height > width && width > SCALE_SIZE) {
      height = Math.round((SCALE_SIZE / width) * height);
      width = SCALE_SIZE;
    }

    // Calculate the number of 512px squares
    const numSquares = Math.ceil(width / 512) * Math.ceil(height / 512);

    // Calculate the token cost
    tokenCost += (numSquares * 170) + 85;

    return tokenCost;
  }
}
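As a sanity check on the image-token math above, here is a worked example (not part of the commit; the dimensions are arbitrary, and the top-level await assumes an ESM/Bun context):

const service = new OpenAIService();

// A 2048x4096 image at 'high' detail is first capped to 1024x2048 (longest
// side 2048), then its shorter side is scaled to 768, giving 768x1536.
// That makes ceil(768/512) * ceil(1536/512) = 2 * 3 = 6 tiles of 512px,
// so the cost is 6 * 170 + 85 = 1105 tokens.
const cost = await service.calculateImageTokens(2048, 4096, 'high');
console.log(cost); // 1105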
156 changes: 156 additions & 0 deletions captions/app.ts
@@ -0,0 +1,156 @@
import type { ChatCompletion, ChatCompletionMessageParam } from "openai/resources/chat/completions";
import { OpenAIService } from './OpenAIService';
import { readFile, writeFile } from 'fs/promises';
import { join } from 'path';
import { extractImageContextSystemMessage, refineDescriptionSystemMessage, previewImageSystemMessage } from './prompts';

const openaiService = new OpenAIService();

// An image extracted from the article, enriched as it moves through the pipeline
export type Image = {
  alt: string;
  url: string;
  context: string;
  description: string;
  preview: string;
  base64: string;
  name: string;
};


async function extractImages(article: string): Promise<Image[]> {
  const imageRegex = /!\[([^\]]*)\]\(([^)]+)\)/g;
  const matches = [...article.matchAll(imageRegex)];

  const imagePromises = matches.map(async ([, alt, url]) => {
    try {
      const name = url.split('/').pop() || '';
      const response = await fetch(url);
      if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.statusText}`);
      const arrayBuffer = await response.arrayBuffer();
      const base64 = Buffer.from(arrayBuffer).toString('base64');

      return {
        alt,
        url,
        context: '',
        description: '',
        preview: '',
        base64,
        name
      };
    } catch (error) {
      console.error(`Error processing image ${url}:`, error);
      return null;
    }
  });

  const results = await Promise.all(imagePromises);
  return results.filter((link): link is Image => link !== null);
}
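// Illustrative example (not from the commit): given the markdown
//   ![Cover photo](https://example.com/images/cover.png)
// the regex above captures alt = "Cover photo" and
// url = "https://example.com/images/cover.png", so name becomes "cover.png";
// the file is fetched and base64-encoded for inlining as a data URL below.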


// Ask the vision model for a short, structured preview of a single image
async function previewImage(image: Image): Promise<{ name: string; preview: string }> {
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: [
      {
        type: "image_url",
        image_url: { url: `data:image/jpeg;base64,${image.base64}` }
      },
      {
        type: "text",
        text: `Describe the image ${image.name} concisely. Focus on the main elements and overall composition. Return the result in JSON format with only 'name' and 'preview' properties.`
      }
    ]
  };

  const response = await openaiService.completion([previewImageSystemMessage, userMessage], 'gpt-4o', false, true) as ChatCompletion;
  const result = JSON.parse(response.choices[0].message.content || '{}');
  return { name: result.name || image.name, preview: result.preview || '' };
}
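// Example of the JSON shape expected back (illustrative, not from the commit):
// { "name": "cover.png", "preview": "A microphone on a desk next to a laptop." }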

async function getImageContext(title: string, article: string, images: Image[]): Promise<{ images: Array<{ name: string, context: string, preview: string }> }> {
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: `Title: ${title}\n\n${article}`
  };

  const response = await openaiService.completion([extractImageContextSystemMessage(images), userMessage], 'gpt-4o', false, true) as ChatCompletion;
  const result = JSON.parse(response.choices[0].message.content || '{}');

  // Generate previews for all images simultaneously
  const previewPromises = images.map(image => previewImage(image));
  const previews = await Promise.all(previewPromises);

  // Merge context and preview information
  const mergedResults = result.images.map((contextImage: { name: string, context: string }) => {
    const preview = previews.find(p => p.name === contextImage.name);
    return {
      ...contextImage,
      preview: preview ? preview.preview : ''
    };
  });

  return { images: mergedResults };
}

// Combine the raw image, its article context, and the preview into a final description
async function refineDescription(image: Image): Promise<Image> {
  const userMessage: ChatCompletionMessageParam = {
    role: 'user',
    content: [
      {
        type: "image_url",
        image_url: { url: `data:image/jpeg;base64,${image.base64}` }
      },
      {
        type: "text",
        text: `Write a description of the image ${image.name}. I have some <context>${image.context}</context> that should be useful for understanding the image in a better way. An initial preview of the image is: <preview>${image.preview}</preview>. A good description briefly describes what is on the image, and uses the context to make it more relevant to the article. The purpose of this description is to summarize the article, so we need just the essence of the image in its context, not a detailed account of everything in it.`
      }
    ]
  };

  console.log(userMessage); // debug: inspect the prompt sent to the model

  const response = await openaiService.completion([refineDescriptionSystemMessage, userMessage], 'gpt-4o', false) as ChatCompletion;
  const result = response.choices[0].message.content || '';
  return { ...image, description: result };
}

/**
 * Orchestrates the full pipeline: extracts images from the article, gathers
 * their context and previews, refines each description, and saves the results.
 */
async function processAndSummarizeImages(title: string, path: string) {
  // Read the article file
  const article = await readFile(path, 'utf-8');

  // Extract images from the article
  const images = await extractImages(article);
  console.log('Number of images found:', images.length);

  const contexts = await getImageContext(title, article, images);
  console.log('Number of image metadata entries found:', contexts.images.length);

  // Process each image: use context and preview from getImageContext, then refine the description
  const processedImages = await Promise.all(images.map(async (image) => {
    const { context = '', preview = '' } = contexts.images.find(ctx => ctx.name === image.name) || {};
    return await refineDescription({ ...image, preview, context });
  }));

  // Prepare and save the described images (excluding base64 data)
  const describedImages = processedImages.map(({ base64, ...rest }) => rest);
  await writeFile(join(__dirname, 'descriptions.json'), JSON.stringify(describedImages, null, 2));

  // Prepare and save the final data (only url and description)
  const captions = describedImages.map(({ url, description }) => ({ url, description }));
  await writeFile(join(__dirname, 'captions.json'), JSON.stringify(captions, null, 2));

  // Log completion message
  console.log('Final data saved to descriptions.json and captions.json');
}

// Execute the main function
processAndSummarizeImages('Lesson #0201 — Audio i interfejs głosowy', join(__dirname, 'article.md'))
  .catch(error => console.error('Error while processing and summarizing images:', error));
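To try the example (a note added here, not part of the commit): assuming a Bun runtime with OPENAI_API_KEY set, and article.md plus the prompts module from this commit in the captions directory, running `bun run captions/app.ts` executes the pipeline and writes descriptions.json and captions.json alongside the script.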
