updated libraries, see notes

- Updated moondream; it now requires libvips-42.dll in the project root (see the moondream README for the download link).
- Fixed txt_summary not working if no model is loaded first.
- Added /img/codeformer (WIP).
monofy-org · Feb 5, 2025 · 4055a2c
1 parent af1c469
commit 4055a2c
Show file tree

Hide file tree

Showing 21 changed files with 621 additions and 85 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -67,3 +67,18 @@
 [submodule "submodules/stable_audio_tools"]
 	path = submodules/stable_audio_tools
 	url = https://github.com/monofy-org/stable-audio-tools
+[submodule "submodules/Allegro"]
+	path = submodules/Allegro
+	url = https://github.com/rhymes-ai/Allegro.git
+[submodule "submodules/MMAudio"]
+	path = submodules/MMAudio
+	url = https://github.com/hkchengrex/MMAudio.git
+[submodule "submodules/LatentSync"]
+	path = submodules/LatentSync
+	url = https://github.com/bytedance/LatentSync
+[submodule "submodules/CodeFormer"]
+	path = submodules/CodeFormer
+	url = https://github.com/sczhou/CodeFormer@c5b4593074ba6214284d6acd5f1719b6c5d739af
+[submodule "submodules/BasicSR"]
+	path = submodules/BasicSR
+	url = https://github.com/xinntao/BasicSR
diff --git a/README.md b/README.md
@@ -22,13 +22,13 @@ Your mileage may vary. If you have a lot of CPU RAM, many features will still wo
 
 ## What is included?
 - Large language model using Exllamav2 (Llama 3.1 8b by default, other options available)
-- Stable Diffusion: (SD1.5, SDXL, SD3, Turbo, Lightning, Cascade, IC Relight, and more)
-- Video: Stable Video Diffusion, XT, AnimateLCM with multiple interpolation techniques available
-- Audio: MusicGen, AudioGen, Stable Audio
+- Vision: YOLOS, Moondream, Owl, LLaVA, DepthAnything, Midas, Canny, and more
+- Speech dictation using Whisper
+- Image Generation: (SD1.5, SDXL, SD3, Turbo, Lightning, Cascade, IC Relight, Flux, and more)
+- Video: Stable Video Diffusion XT, LivePortrait, AnimateLCM with multiple modes available
+- Audio: MusicGen, AudioGen, MMAudio
 - Text-to-speech: XTTS with instant voice cloning from 6-20sec samples, edge TTS api also included
 - Canny and depth detection with text-to-image IP adapter support
-- Vision: YOLOS, Moondream, LLaVA
-- Speech dictation using Whisper
 - 3D model generation: Shap-E, TripoSR, LGM Mini
 - Endpoints with combinations of features to automate workflow
 - Easy plugin system that copilot understands (write plugins for new HF models in minutes or seconds)
@@ -58,6 +58,7 @@ Yes! Models and other resources are downloaded automatically. This project aims
 - `/img/depth`
 - `/img/depth/midas`
 - `/img/rembg`
+- `/vid2densepose`
 
 ## Image Generation
 - `/txt2img`
@@ -66,6 +67,7 @@ Yes! Models and other resources are downloaded automatically. This project aims
 - `/txt2img/flux`
 - `/txt2img/canny`
 - `/txt2img/depth`
+- `/txt2img/openpose`
 - `/txt2img/relight`
 - `/txt2img/instantid`
 - `/txt2img/cascade`
@@ -81,7 +83,6 @@ Yes! Models and other resources are downloaded automatically. This project aims
 - `/txt2vid/animate`
 - `/txt2vid/zero`
 - `/txt2vid/zeroscope`
-- `/vid2densepose`
 - `/img2vid/liveportrait`
 
 ## Computer Vision
@@ -93,6 +94,7 @@ Yes! Models and other resources are downloaded automatically. This project aims
 
 ## Audio
 - `/txt2wav/musicgen`
+- `/mmaudio`
 - `/piano2midi`
 
 ## Text Generation
@@ -107,6 +109,9 @@ Yes! Models and other resources are downloaded automatically. This project aims
 - `/youtube/grid`
 - `/youtube/frames`
 
+## Reddit Tools
+- `/reddit/download`
+
 ## Text-to-Speech (TTS)
 - `/tts`
 

diff --git a/classes/requests.py b/classes/requests.py
@@ -74,7 +74,7 @@ class Txt2VidRequest(BaseModel):
     model_index: Optional[int] = TXT2VID_DEFAULT_MODEL_INDEX
     clip_index: Optional[int] = None
     motion_adapter: Optional[Literal["animatediff", "animatelcm"]] = "animatediff"
-    scheduler: Optional[Literal["euler_a", "lcm", "sde", "tcd"]] = "lcm"
+    scheduler: Optional[Literal["euler_a", "lcm", "sde", "tcd", "custom"]] = "lcm"
     use_animatelcm: Optional[bool] = False
     use_lightning: Optional[bool] = False
 

diff --git a/modules/plugins.py b/modules/plugins.py
@@ -45,6 +45,7 @@ def load_plugins():
     from plugins.txt2wav_stable_audio import Txt2WavStableAudioPlugin
     from plugins.img_depth_anything import DepthAnythingPlugin
     from plugins.img_depth_midas import DepthMidasPlugin
+    from plugins.img_codeformer import CodeFormerPlugin
     from plugins.detect_yolos import DetectYOLOSPlugin
     from plugins.detetct_owl import DetectOwlPlugin
     from plugins.img2model_lgm import Img2ModelLGMPlugin
@@ -136,6 +137,7 @@ def load_plugins():
     register_plugin(Img2TxtLlavaPlugin, quiet)
     register_plugin(Img2TxtMoondreamPlugin, quiet)
     register_plugin(RembgPlugin, quiet)
+    register_plugin(CodeFormerPlugin, quiet)
     register_plugin(ImgUpresPlugin, quiet)
     register_plugin(Txt2WavMusicGenPlugin, quiet)
     register_plugin(ExllamaV2Plugin, quiet)

diff --git a/plugins/extras/youtube.py b/plugins/extras/youtube.py
@@ -78,7 +78,7 @@ def download_media(
     from pytubefix import YouTube
     from moviepy.editor import VideoFileClip
 
-    yt: YouTube = YouTube(url)
+    yt: YouTube = YouTube(url, 'WEB')
 
     # extract start time from url
     start_time_seconds = 0

diff --git a/plugins/img2txt_moondream.py b/plugins/img2txt_moondream.py
@@ -1,10 +1,13 @@
 import logging
 import math
 from typing import Optional
+
+import torch
 from fastapi import Depends, HTTPException
-from PIL import Image
 from fastapi.responses import JSONResponse
+from PIL import Image
 from pydantic import BaseModel
+
 from modules.plugins import PluginBase, release_plugin, use_plugin
 from utils.gpu_utils import autodetect_device, autodetect_dtype, set_seed
 from utils.image_utils import get_image_from_request
@@ -21,15 +24,16 @@ class VisionRequest(BaseModel):
 
 
 class Img2TxtMoondreamPlugin(PluginBase):
-
     name = "Vision (vikhyatk/moondream2)"
     description = "Image-to-text using Moondream."
     device = autodetect_device()
     dtype = autodetect_dtype(False)
     instance = None
 
-    def __init__(self):        
-        from transformers import AutoTokenizer, AutoModelForCausalLM
+    def __init__(self):
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        from submodules.moondream.moondream.torch.moondream import MoondreamModel
 
         model_id = "vikhyatk/moondream2"
 
@@ -40,7 +44,7 @@ def __init__(self):
             model_id,
             trust_remote_code=True,
         ).to(
-            device=Img2TxtMoondreamPlugin.device,
+            device=self.device,
             dtype=self.dtype,
         )
         moondream.eval()
@@ -53,18 +57,21 @@ def __init__(self):
         }
 
     async def generate_response(self, image: Image.Image, prompt: str, seed: int = -1):
-        from submodules.moondream.moondream import Moondream
-        from transformers import (
-            CodeGenTokenizerFast as Tokenizer,
-        )
+        from transformers import CodeGenTokenizerFast as Tokenizer
+
+        from submodules.moondream.moondream.torch.moondream import MoondreamModel
 
-        moondream: Moondream = self.resources["moondream"]
-        tokenizer: Tokenizer = self.resources["tokenizer"]
+        moondream: MoondreamModel = self.resources["moondream"]
         seed = set_seed(seed)
-        print("Encoding image...")
-        image_embeds = moondream.encode_image(image)
+
         print("Getting response...")
-        return moondream.answer_question(image_embeds, prompt, tokenizer)
+        response = moondream.query(image, prompt, False)
+        answer = response.get("answer", "").strip()
+
+        if not answer:
+            raise HTTPException(status_code=500, detail="No response")
+
+        return answer
 
 
 @PluginBase.router.post("/vision", response_class=JSONResponse)
@@ -98,8 +105,9 @@ async def vision(req: VisionRequest):
                     )
                 )
 
-        response = await plugin.generate_response(img, req.prompt)
-        return JSONResponse({"response": response})
+        answer = await plugin.generate_response(img, req.prompt)
+
+        return JSONResponse({"response": answer})
     except Exception as e:
         logging.error(e, exc_info=True)
         raise HTTPException(status_code=500, detail=str(e))

diff --git a/plugins/img2vid_xt.py b/plugins/img2vid_xt.py
@@ -1,27 +1,29 @@
 import logging
 import os
+from typing import Optional
 from urllib.parse import urlparse
+
 import huggingface_hub
-from typing import Optional
-from PIL import Image
+import torch
 from fastapi import BackgroundTasks, Depends
 from fastapi.responses import FileResponse
+from PIL import Image
 from pydantic import BaseModel
-import torch
-from classes.animatelcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler
+
 from classes.animatelcm_pipeline import StableVideoDiffusionPipeline
-from modules.plugins import PluginBase, use_plugin, release_plugin
+from classes.animatelcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler
+from modules.plugins import PluginBase, release_plugin, use_plugin
 from plugins.video_plugin import VideoPlugin
-from utils.console_logging import log_generate, log_loading
-from utils.gpu_utils import set_seed
-from utils.image_utils import crop_and_resize, get_image_from_request
 from settings import (
+    HYPERTILE_VIDEO,
     IMG2VID_DECODE_CHUNK_SIZE,
-    IMG2VID_MAX_FRAMES,
     IMG2VID_DEFAULT_MOTION_BUCKET,
-    HYPERTILE_VIDEO,
+    IMG2VID_MAX_FRAMES,
     SVD_MODEL,
 )
+from utils.console_logging import log_generate, log_loading
+from utils.gpu_utils import set_seed
+from utils.image_utils import crop_and_resize, get_image_from_request
 from utils.video_utils import get_video_from_request
 
 
@@ -101,7 +103,7 @@ def load_weights(self, file_path):
         log_loading("weights", os.path.basename(file_path))
         from safetensors.torch import load_file
 
-        pipe.unet.load_state_dict(load_file(file_path, device="cuda:0"), strict=False)
+        pipe.unet.load_state_dict(load_file(file_path, device="cpu"), strict=False)
 
 
 def is_source_movie(url: str):
@@ -180,7 +182,9 @@ async def gen():
             log_generate(f"Generating video ({req.width}x{req.height})")
 
             pipe.enable_model_cpu_offload(None, plugin.device)
-            pipe.enable_sequential_cpu_offload(None, plugin.device)
+
+            if (width * height > 576 * 576):
+                pipe.enable_sequential_cpu_offload(None, plugin.device)
 
             with torch.autocast("cuda"):
                 frames = pipe(