Skip to content

Commit

Permalink
updated libraries, see notes
Browse files Browse the repository at this point in the history
- updated moondream, requires libvips-42.dll in the project root (see moondream readme for link)
- fixed txt_summary not working if no model is loaded first
- added /img/codeformer (WIP)
  • Loading branch information
JohnnyStreet committed Feb 5, 2025
1 parent af1c469 commit 4055a2c
Show file tree
Hide file tree
Showing 21 changed files with 621 additions and 85 deletions.
15 changes: 15 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,18 @@
[submodule "submodules/stable_audio_tools"]
path = submodules/stable_audio_tools
url = https://github.com/monofy-org/stable-audio-tools
[submodule "submodules/Allegro"]
path = submodules/Allegro
url = https://github.com/rhymes-ai/Allegro.git
[submodule "submodules/MMAudio"]
path = submodules/MMAudio
url = https://github.com/hkchengrex/MMAudio.git
[submodule "submodules/LatentSync"]
path = submodules/LatentSync
url = https://github.com/bytedance/LatentSync
[submodule "submodules/CodeFormer"]
path = submodules/CodeFormer
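# Intended revision: c5b4593074ba6214284d6acd5f1719b6c5d739af (a submodule is pinned by the commit recorded in the superproject, not by a suffix on the URL)
url = https://github.com/sczhou/CodeFormer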
[submodule "submodules/BasicSR"]
path = submodules/BasicSR
url = https://github.com/xinntao/BasicSR
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ Your mileage may vary. If you have a lot of CPU RAM, many features will still wo

## What is included?
- Large language model using Exllamav2 (Llama 3.1 8b by default, other options available)
- Stable Diffusion: (SD1.5, SDXL, SD3, Turbo, Lightning, Cascade, IC Relight, and more)
- Video: Stable Video Diffusion, XT, AnimateLCM with multiple interpolation techniques available
- Audio: MusicGen, AudioGen, Stable Audio
- Vision: YOLOS, Moondream, Owl, LLaVA, DepthAnything, Midas, Canny, and more
- Speech dictation using Whisper
- Image Generation: (SD1.5, SDXL, SD3, Turbo, Lightning, Cascade, IC Relight, Flux, and more)
- Video: Stable Video Diffusion XT, LivePortrait, AnimateLCM with multiple modes available
- Audio: MusicGen, AudioGen, MMAudio
- Text-to-speech: XTTS with instant voice cloning from 6-20 second samples; Edge TTS API also included
- Canny and depth detection with text-to-image IP adapter support
- Vision: YOLOS, Moondream, LLaVA
- Speech dictation using Whisper
- 3D model generation: Shap-E, TripoSR, LGM Mini
- Endpoints with combinations of features to automate workflow
- Easy plugin system that Copilot understands (write plugins for new HF models in minutes or seconds)
Expand Down Expand Up @@ -58,6 +58,7 @@ Yes! Models and other resources are downloaded automatically. This project aims
- `/img/depth`
- `/img/depth/midas`
- `/img/rembg`
- `/vid2densepose`

## Image Generation
- `/txt2img`
Expand All @@ -66,6 +67,7 @@ Yes! Models and other resources are downloaded automatically. This project aims
- `/txt2img/flux`
- `/txt2img/canny`
- `/txt2img/depth`
- `/txt2img/openpose`
- `/txt2img/relight`
- `/txt2img/instantid`
- `/txt2img/cascade`
Expand All @@ -81,7 +83,6 @@ Yes! Models and other resources are downloaded automatically. This project aims
- `/txt2vid/animate`
- `/txt2vid/zero`
- `/txt2vid/zeroscope`
- `/vid2densepose`
- `/img2vid/liveportrait`

## Computer Vision
Expand All @@ -93,6 +94,7 @@ Yes! Models and other resources are downloaded automatically. This project aims

## Audio
- `/txt2wav/musicgen`
- `/mmaudio`
- `/piano2midi`

## Text Generation
Expand All @@ -107,6 +109,9 @@ Yes! Models and other resources are downloaded automatically. This project aims
- `/youtube/grid`
- `/youtube/frames`

## Reddit Tools
- `/reddit/download`

## Text-to-Speech (TTS)
- `/tts`

Expand Down
2 changes: 1 addition & 1 deletion classes/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class Txt2VidRequest(BaseModel):
model_index: Optional[int] = TXT2VID_DEFAULT_MODEL_INDEX
clip_index: Optional[int] = None
motion_adapter: Optional[Literal["animatediff", "animatelcm"]] = "animatediff"
scheduler: Optional[Literal["euler_a", "lcm", "sde", "tcd"]] = "lcm"
scheduler: Optional[Literal["euler_a", "lcm", "sde", "tcd", "custom"]] = "lcm"
use_animatelcm: Optional[bool] = False
use_lightning: Optional[bool] = False

Expand Down
2 changes: 2 additions & 0 deletions modules/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def load_plugins():
from plugins.txt2wav_stable_audio import Txt2WavStableAudioPlugin
from plugins.img_depth_anything import DepthAnythingPlugin
from plugins.img_depth_midas import DepthMidasPlugin
from plugins.img_codeformer import CodeFormerPlugin
from plugins.detect_yolos import DetectYOLOSPlugin
from plugins.detetct_owl import DetectOwlPlugin
from plugins.img2model_lgm import Img2ModelLGMPlugin
Expand Down Expand Up @@ -136,6 +137,7 @@ def load_plugins():
register_plugin(Img2TxtLlavaPlugin, quiet)
register_plugin(Img2TxtMoondreamPlugin, quiet)
register_plugin(RembgPlugin, quiet)
register_plugin(CodeFormerPlugin, quiet)
register_plugin(ImgUpresPlugin, quiet)
register_plugin(Txt2WavMusicGenPlugin, quiet)
register_plugin(ExllamaV2Plugin, quiet)
Expand Down
2 changes: 1 addition & 1 deletion plugins/extras/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def download_media(
from pytubefix import YouTube
from moviepy.editor import VideoFileClip

yt: YouTube = YouTube(url)
yt: YouTube = YouTube(url, 'WEB')

# extract start time from url
start_time_seconds = 0
Expand Down
40 changes: 24 additions & 16 deletions plugins/img2txt_moondream.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import logging
import math
from typing import Optional

import torch
from fastapi import Depends, HTTPException
from PIL import Image
from fastapi.responses import JSONResponse
from PIL import Image
from pydantic import BaseModel

from modules.plugins import PluginBase, release_plugin, use_plugin
from utils.gpu_utils import autodetect_device, autodetect_dtype, set_seed
from utils.image_utils import get_image_from_request
Expand All @@ -21,15 +24,16 @@ class VisionRequest(BaseModel):


class Img2TxtMoondreamPlugin(PluginBase):

name = "Vision (vikhyatk/moondream2)"
description = "Image-to-text using Moondream."
device = autodetect_device()
dtype = autodetect_dtype(False)
instance = None

def __init__(self):
from transformers import AutoTokenizer, AutoModelForCausalLM
def __init__(self):
from transformers import AutoModelForCausalLM, AutoTokenizer

from submodules.moondream.moondream.torch.moondream import MoondreamModel

model_id = "vikhyatk/moondream2"

Expand All @@ -40,7 +44,7 @@ def __init__(self):
model_id,
trust_remote_code=True,
).to(
device=Img2TxtMoondreamPlugin.device,
device=self.device,
dtype=self.dtype,
)
moondream.eval()
Expand All @@ -53,18 +57,21 @@ def __init__(self):
}

async def generate_response(self, image: Image.Image, prompt: str, seed: int = -1):
from submodules.moondream.moondream import Moondream
from transformers import (
CodeGenTokenizerFast as Tokenizer,
)
from transformers import CodeGenTokenizerFast as Tokenizer

from submodules.moondream.moondream.torch.moondream import MoondreamModel

moondream: Moondream = self.resources["moondream"]
tokenizer: Tokenizer = self.resources["tokenizer"]
moondream: MoondreamModel = self.resources["moondream"]
seed = set_seed(seed)
print("Encoding image...")
image_embeds = moondream.encode_image(image)

print("Getting response...")
return moondream.answer_question(image_embeds, prompt, tokenizer)
response = moondream.query(image, prompt, False)
answer = response.get("answer", "").strip()

if not answer:
raise HTTPException(status_code=500, detail="No response")

return answer


@PluginBase.router.post("/vision", response_class=JSONResponse)
Expand Down Expand Up @@ -98,8 +105,9 @@ async def vision(req: VisionRequest):
)
)

response = await plugin.generate_response(img, req.prompt)
return JSONResponse({"response": response})
answer = await plugin.generate_response(img, req.prompt)

return JSONResponse({"response": answer})
except Exception as e:
logging.error(e, exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
Expand Down
28 changes: 16 additions & 12 deletions plugins/img2vid_xt.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
import logging
import os
from typing import Optional
from urllib.parse import urlparse

import huggingface_hub
from typing import Optional
from PIL import Image
import torch
from fastapi import BackgroundTasks, Depends
from fastapi.responses import FileResponse
from PIL import Image
from pydantic import BaseModel
import torch
from classes.animatelcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler

from classes.animatelcm_pipeline import StableVideoDiffusionPipeline
from modules.plugins import PluginBase, use_plugin, release_plugin
from classes.animatelcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler
from modules.plugins import PluginBase, release_plugin, use_plugin
from plugins.video_plugin import VideoPlugin
from utils.console_logging import log_generate, log_loading
from utils.gpu_utils import set_seed
from utils.image_utils import crop_and_resize, get_image_from_request
from settings import (
HYPERTILE_VIDEO,
IMG2VID_DECODE_CHUNK_SIZE,
IMG2VID_MAX_FRAMES,
IMG2VID_DEFAULT_MOTION_BUCKET,
HYPERTILE_VIDEO,
IMG2VID_MAX_FRAMES,
SVD_MODEL,
)
from utils.console_logging import log_generate, log_loading
from utils.gpu_utils import set_seed
from utils.image_utils import crop_and_resize, get_image_from_request
from utils.video_utils import get_video_from_request


Expand Down Expand Up @@ -101,7 +103,7 @@ def load_weights(self, file_path):
log_loading("weights", os.path.basename(file_path))
from safetensors.torch import load_file

pipe.unet.load_state_dict(load_file(file_path, device="cuda:0"), strict=False)
pipe.unet.load_state_dict(load_file(file_path, device="cpu"), strict=False)


def is_source_movie(url: str):
Expand Down Expand Up @@ -180,7 +182,9 @@ async def gen():
log_generate(f"Generating video ({req.width}x{req.height})")

pipe.enable_model_cpu_offload(None, plugin.device)
pipe.enable_sequential_cpu_offload(None, plugin.device)

if (width * height > 576 * 576):
pipe.enable_sequential_cpu_offload(None, plugin.device)

with torch.autocast("cuda"):
frames = pipe(
Expand Down
Loading

0 comments on commit 4055a2c

Please sign in to comment.