Skip to content

Commit

Permalink
publish 0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
vikhyat committed Nov 3, 2024
1 parent 9ce29b1 commit b371fc3
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 22 deletions.
28 changes: 16 additions & 12 deletions clients/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,16 @@ not recommended yet.
Install the library using pip:

```
pip install git+https://github.com/vikhyat/moondream.git#subdirectory=moondream/clients/python
pip install moondream==0.0.1
```

Then download the model weights:

```
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-int8.bin.gz?download=true" -O moondream-latest-int8.bin.gz
# int8 weights (recommended):
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-int8.bin.gz?download=true" -O - | gunzip > moondream-latest-int8.bin
# ...or, for FP16 weights:
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-f16.bin.gz?download=true" -O moondream-latest-f16.bin.gz
```

This downloads gzipped weights, which offers significant bandwidth savings.
You can load gzipped weights directly in the library, but to avoid runtime
decompression (which takes time), we recommend unzipping the weights:

```
gunzip moondream-latest-*.bin.gz
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-f16.bin.gz?download=true" -O - | gunzip > moondream-latest-f16.bin
```

## Usage
Expand All @@ -49,7 +42,18 @@ image = Image.open("path/to/image.jpg")
encoded_image = model.encode_image(image)

# Caption the image.
for t in model.caption(encoded_image):
caption = model.caption(encoded_image)["caption"]

# ...or, if you want to stream the output:
for t in model.caption(encoded_image, stream=True)["caption"]:
print(t, end="", flush=True)

# Ask a question about the image.
question = "How many people are in this image?"
answer = model.answer_question(encoded_image, question)["answer"]

# ...or again, if you want to stream the output:
for t in model.answer_question(encoded_image, question, stream=True)["answer"]:
print(t, end="", flush=True)
```

Expand Down
51 changes: 41 additions & 10 deletions clients/python/moondream/vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ class EncodedImage:
total=False,
)

class CaptionOutput(TypedDict):
    # Full caption string, or a token generator when stream=True was requested.
    caption: Union[str, Generator[str, None, None]]


class QueryOutput(TypedDict):
    # Full answer string, or a token generator when stream=True was requested.
    answer: Union[str, Generator[str, None, None]]


# Default cap on generated tokens when the caller's SamplingSettings omit it.
DEFAULT_MAX_TOKENS = 1024
# Highest model-weights format version this client knows how to load.
LATEST_SUPPORTED_VERSION = 0

Expand Down Expand Up @@ -163,8 +170,9 @@ def caption(
self,
image: Union[Image.Image, EncodedImage],
length: str = "normal",
stream: bool = False,
settings: Optional[SamplingSettings] = None,
) -> Union[str, Generator[str, None, None]]:
) -> CaptionOutput:
"""
Generate a caption for the input image.
Expand All @@ -189,15 +197,26 @@ def caption(
max_tokens = settings.get("max_tokens", DEFAULT_MAX_TOKENS)

encoded_image = self.encode_image(image)
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t

def generator():
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t

if stream:
return {"caption": generator()}
else:
out = ""
for t in generator():
out += t
return {"caption": out}

def query(
self,
image: Union[Image.Image, EncodedImage],
question: str,
stream: bool = False,
settings: Optional[SamplingSettings] = None,
) -> Union[str, Generator[str, None, None]]:
) -> QueryOutput:
"""
Generate an answer to the input question about the input image.
Expand All @@ -211,18 +230,30 @@ def query(
if "query" not in self.templates:
raise ValueError("Model does not support querying.")

question_toks = self.templates["query"]["prefix"] + self.tokenizer.encode(
f"\n\nQuestion: {question}\n\nAnswer:"
).ids + self.templates["query"]["suffix"]
question_toks = (
self.templates["query"]["prefix"]
+ self.tokenizer.encode(question).ids
+ self.templates["query"]["suffix"]
)

(input_embeds,) = self.text_encoder.run(None, {"input_ids": [question_toks]})
if settings is None:
settings = {}
max_tokens = settings.get("max_tokens", DEFAULT_MAX_TOKENS)

encoded_image = self.encode_image(image) # type: ignore
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t
encoded_image = self.encode_image(image)

def generator():
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t

if stream:
return {"answer": generator()}
else:
out = ""
for t in generator():
out += t
return {"answer": out}

def detect(
self, image: Union[Image.Image, EncodedImage], object: str
Expand Down

0 comments on commit b371fc3

Please sign in to comment.