Skip to content

Commit

Permalink
publish 0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
vikhyat committed Nov 3, 2024
1 parent 9ce29b1 commit b371fc3
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 22 deletions.
28 changes: 16 additions & 12 deletions clients/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,16 @@ not recommended yet.
Install the library using pip:

```
pip install git+https://github.com/vikhyat/moondream.git#subdirectory=moondream/clients/python
pip install moondream==0.0.1
```

Then download the model weights:

```
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-int8.bin.gz?download=true" -O moondream-latest-int8.bin.gz
# int8 weights (recommended):
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-int8.bin.gz?download=true" -O - | gunzip > moondream-latest-int8.bin
# ...or, for FP16 weights:
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-f16.bin.gz?download=true" -O moondream-latest-f16.bin.gz
```

This downloads gzipped weights, which offers significant bandwidth savings.
You can load gzipped weights directly in the library, but to avoid runtime
decompression (which takes time), we recommend unzipping the weights:

```
gunzip moondream-latest-*.bin.gz
wget "https://huggingface.co/vikhyatk/moondream2/resolve/client/moondream-latest-f16.bin.gz?download=true" -O - | gunzip > moondream-latest-f16.bin
```

## Usage
Expand All @@ -49,7 +42,18 @@ image = Image.open("path/to/image.jpg")
encoded_image = model.encode_image(image)

# Caption the image.
for t in model.caption(encoded_image):
caption = model.caption(encoded_image)["caption"]

# ...or, if you want to stream the output:
for t in model.caption(encoded_image, stream=True)["caption"]:
print(t, end="", flush=True)

# Ask a question about the image.
question = "How many people are in this image?"
answer = model.answer_question(encoded_image, question)["answer"]

# ...or again, if you want to stream the output:
for t in model.answer_question(encoded_image, question, stream=True)["answer"]:
print(t, end="", flush=True)
```

Expand Down
51 changes: 41 additions & 10 deletions clients/python/moondream/vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ class EncodedImage:
total=False,
)

class CaptionOutput(TypedDict):
    # Full caption string, or a token generator when stream=True was requested.
    caption: Union[str, Generator[str, None, None]]


class QueryOutput(TypedDict):
    # Full answer string, or a token generator when stream=True was requested.
    answer: Union[str, Generator[str, None, None]]


# Default cap on generated tokens when the caller's SamplingSettings omit it.
DEFAULT_MAX_TOKENS = 1024
# Highest model-weights format version this client knows how to load.
LATEST_SUPPORTED_VERSION = 0

Expand Down Expand Up @@ -163,8 +170,9 @@ def caption(
self,
image: Union[Image.Image, EncodedImage],
length: str = "normal",
stream: bool = False,
settings: Optional[SamplingSettings] = None,
) -> Union[str, Generator[str, None, None]]:
) -> CaptionOutput:
"""
Generate a caption for the input image.
Expand All @@ -189,15 +197,26 @@ def caption(
max_tokens = settings.get("max_tokens", DEFAULT_MAX_TOKENS)

encoded_image = self.encode_image(image)
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t

def generator():
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t

if stream:
return {"caption": generator()}
else:
out = ""
for t in generator():
out += t
return {"caption": out}

def query(
self,
image: Union[Image.Image, EncodedImage],
question: str,
stream: bool = False,
settings: Optional[SamplingSettings] = None,
) -> Union[str, Generator[str, None, None]]:
) -> QueryOutput:
"""
Generate an answer to the input question about the input image.
Expand All @@ -211,18 +230,30 @@ def query(
if "query" not in self.templates:
raise ValueError("Model does not support querying.")

question_toks = self.templates["query"]["prefix"] + self.tokenizer.encode(
f"\n\nQuestion: {question}\n\nAnswer:"
).ids + self.templates["query"]["suffix"]
question_toks = (
self.templates["query"]["prefix"]
+ self.tokenizer.encode(question).ids
+ self.templates["query"]["suffix"]
)

(input_embeds,) = self.text_encoder.run(None, {"input_ids": [question_toks]})
if settings is None:
settings = {}
max_tokens = settings.get("max_tokens", DEFAULT_MAX_TOKENS)

encoded_image = self.encode_image(image) # type: ignore
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t
encoded_image = self.encode_image(image)

def generator():
for t in self._generate(input_embeds, encoded_image, max_tokens):
yield t

if stream:
return {"answer": generator()}
else:
out = ""
for t in generator():
out += t
return {"answer": out}

def detect(
self, image: Union[Image.Image, EncodedImage], object: str
Expand Down

0 comments on commit b371fc3

Please sign in to comment.