
Commit

dmitrymailk committed Jan 15, 2025
2 parents fe12684 + d9fbcdc commit a059469
Showing 66 changed files with 767 additions and 315 deletions.
129 changes: 69 additions & 60 deletions diffusers/examples/instruct_pix2pix/inference_pix2pix.ipynb

Large diffs are not rendered by default.

121 changes: 81 additions & 40 deletions diffusers/examples/instruct_pix2pix/nfs_pix2pix_dataset.ipynb

Large diffs are not rendered by default.

200 changes: 0 additions & 200 deletions img2img-turbo/inference_pix2pix.ipynb

This file was deleted.

3 changes: 2 additions & 1 deletion img2img-turbo/.gitignore → img2img_turbo/.gitignore
@@ -168,4 +168,5 @@ outputs/
outputs/bird.png
data
wandb
output/
output/
models
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
363 changes: 363 additions & 0 deletions img2img_turbo/inference_pix2pix.ipynb

Large diffs are not rendered by default.

File renamed without changes.
1 change: 1 addition & 0 deletions img2img_turbo/run_benchmark.sh
@@ -0,0 +1 @@
cd .. && python -m img2img_turbo.src.benchmark
File renamed without changes.
143 changes: 143 additions & 0 deletions img2img_turbo/src/benchmark.py
@@ -0,0 +1,143 @@
import argparse
import time

import numpy as np
import torch
import torchvision.transforms.functional as F
from PIL import Image
from torchvision import transforms

from img2img_turbo.src.image_prep import canny_from_pil
from img2img_turbo.src.pix2pix_turbo import Pix2Pix_Turbo

torch._inductor.config.conv_1x1_as_mm = True
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.epilogue_fusion = False
torch._inductor.config.coordinate_descent_check_all_directions = True
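# The inductor flags above are tuning knobs commonly paired with torch.compile for
# diffusion models; they only take effect for modules that are actually compiled
# (the torch.compile calls further down in this file are commented out).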


def merge_loras(model):
    # Fold each LoRA adapter into its base layer, then replace the PEFT-wrapped
    # module with the plain merged base layer so inference skips the adapter indirection.
    model_modules = dict(model.named_modules())
    for module_key in model_modules:
        if "base_layer" in module_key:
            parent_module = ".".join(module_key.split(".")[:-1])
            prev_parent_module = ".".join(module_key.split(".")[:-2])
            if hasattr(model_modules[parent_module], "base_layer"):
                model_modules[parent_module].merge()
                setattr(
                    model_modules[prev_parent_module],
                    parent_module.split(".")[-1],
                    model_modules[module_key],
                )


def single_image(model, dataset, T, prompt):
    input_image = dataset[190]["input_image"].convert("RGB")
    i_t = T(input_image)
    c_t = F.to_tensor(i_t).unsqueeze(0).cuda()
    # c_t = c_t.half()
    c_t = c_t.to(torch.bfloat16)

    start = time.time()
    with torch.no_grad():
        # output_image = model(c_t, prompt)
        output_image = model.custom_forward(c_t, prompt)

    # output_pil = transforms.ToPILImage()(output_image[0].cpu() * 0.5 + 0.5)
    print("single image", time.time() - start)


def multiple_images(model, dataset, T, prompt):
    total_images = 140
    images = [
        dataset[190 + i]["input_image"].convert("RGB") for i in range(total_images)
    ]
    images = [
        F.to_tensor(T(item)).unsqueeze(0).cuda().to(torch.bfloat16) for item in images
    ]

    start = time.time()
    for input_image in images:
        with torch.no_grad():
            # i_t = T(input_image)
            # c_t = F.to_tensor(i_t).unsqueeze(0).cuda()
            # c_t = c_t.half()
            # output_image = model(c_t, prompt)
            # output_image = model.custom_forward(c_t, prompt)
            output_image = model.custom_forward(input_image, prompt)

    # output_pil = transforms.ToPILImage()(output_image[0].cpu() * 0.5 + 0.5)
    full_time = time.time() - start
    print("multiple_images", full_time)
    print("multiple_images fps", 1 / (full_time / total_images))


if __name__ == "__main__":
    from datasets import load_dataset

    dataset_name = "dim/nfs_pix2pix_1920_1080_v5"
    # dataset_name = "dim/nfs_pix2pix_1920_1080_v6"
    dataset = load_dataset(dataset_name, num_proc=4)
    dataset = dataset["train"]

    model_name = ""
    model_path = "/code/img2img_turbo/models/model_20001.pkl"
    use_fp16 = True

    # initialize the model
    model = Pix2Pix_Turbo(pretrained_name=model_name, pretrained_path=model_path)
    merge_loras(model=model)
    model.set_eval()
    # if use_fp16:
    model.to(torch.bfloat16)
    model.unet.to(torch.bfloat16)
    model.vae.to(torch.bfloat16)
    model.unet.fuse_qkv_projections()
    # model.timesteps = 1
    # model.unet.to(memory_format=torch.channels_last)
    # model.vae.to(memory_format=torch.channels_last)
    # model.unet = torch.compile(model.unet, mode="reduce-overhead", fullgraph=not True)
    # model.vae.config.force_upcast = False
    # model.vae.decode = torch.compile(
    #     model.vae.decode, mode="reduce-overhead", fullgraph=not True
    # )

    T = transforms.Compose(
        [
            transforms.Resize(512, interpolation=transforms.InterpolationMode.LANCZOS),
            transforms.CenterCrop(512),
        ]
    )
    prompt = dataset[0]["edit_prompt"]

    single_image(model, dataset, T, prompt)
    single_image(model, dataset, T, prompt)
    single_image(model, dataset, T, prompt)
    single_image(model, dataset, T, prompt)
    multiple_images(model, dataset, T, prompt)
    multiple_images(model, dataset, T, prompt)
    multiple_images(model, dataset, T, prompt)
    multiple_images(model, dataset, T, prompt)
"""
single image 511.1411769390106
single image 1.0843024253845215
single image 0.03383207321166992
single image 0.0336606502532959
multiple_images 8.789534568786621
multiple_images fps 15.928033379283555
multiple_images 8.79971957206726
multiple_images fps 15.909597897232844
multiple_images 8.794561862945557
multiple_images fps 15.918928331139158
multiple_images 8.796127080917358
multiple_images fps 15.916095653474715
"""
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
21 changes: 14 additions & 7 deletions img2img-turbo/src/model.py → img2img_turbo/src/model.py
@@ -5,7 +5,9 @@


def make_1step_sched():
    noise_scheduler_1step = DDPMScheduler.from_pretrained("stabilityai/sd-turbo", subfolder="scheduler")
    noise_scheduler_1step = DDPMScheduler.from_pretrained(
        "stabilityai/sd-turbo", subfolder="scheduler"
    )
    noise_scheduler_1step.set_timesteps(1, device="cuda")
    noise_scheduler_1step.alphas_cumprod = noise_scheduler_1step.alphas_cumprod.cuda()
    return noise_scheduler_1step
@@ -29,12 +31,17 @@ def my_vae_encoder_fwd(self, sample):

def my_vae_decoder_fwd(self, sample, latent_embeds=None):
    sample = self.conv_in(sample)
    upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
    # upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
    # middle
    sample = self.mid_block(sample, latent_embeds)
    sample = sample.to(upscale_dtype)
    # sample = sample.to(upscale_dtype)
    if not self.ignore_skip:
        skip_convs = [self.skip_conv_1, self.skip_conv_2, self.skip_conv_3, self.skip_conv_4]
        skip_convs = [
            self.skip_conv_1,
            self.skip_conv_2,
            self.skip_conv_3,
            self.skip_conv_4,
        ]
        # up
        for idx, up_block in enumerate(self.up_blocks):
            skip_in = skip_convs[idx](self.incoming_skip_acts[::-1][idx] * self.gamma)
@@ -58,10 +65,10 @@ def download_url(url, outf):
    if not os.path.exists(outf):
        print(f"Downloading checkpoint to {outf}")
        response = requests.get(url, stream=True)
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        total_size_in_bytes = int(response.headers.get("content-length", 0))
        block_size = 1024  # 1 Kibibyte
        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
        with open(outf, 'wb') as file:
        progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
        with open(outf, "wb") as file:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
File renamed without changes.
File renamed without changes.
img2img-turbo/src/pix2pix_turbo.py → img2img_turbo/src/pix2pix_turbo.py
@@ -9,9 +9,17 @@
from diffusers.utils.peft_utils import set_weights_and_activate_adapters
from peft import LoraConfig

p = "src/"
sys.path.append(p)
from model import make_1step_sched, my_vae_encoder_fwd, my_vae_decoder_fwd

from .model import make_1step_sched, my_vae_encoder_fwd, my_vae_decoder_fwd
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from diffusers import AutoencoderTiny, StableDiffusionPipeline


class TwinConv(torch.nn.Module):
@@ -41,11 +49,28 @@ def __init__(
"stabilityai/sd-turbo", subfolder="tokenizer"
)
self.text_encoder = CLIPTextModel.from_pretrained(
"stabilityai/sd-turbo", subfolder="text_encoder"
"stabilityai/sd-turbo",
subfolder="text_encoder",
torch_dtype=torch.bfloat16,
).cuda()
self.sched = make_1step_sched()
self.sched.betas = self.sched.betas.to(torch.bfloat16).cuda()
self.sched.alphas = self.sched.alphas.to(torch.bfloat16).cuda()
self.sched.one = self.sched.one.to(torch.bfloat16).cuda()
self.sched.alphas_cumprod = self.sched.alphas_cumprod.to(torch.bfloat16).cuda()

vae = AutoencoderKL.from_pretrained("stabilityai/sd-turbo", subfolder="vae")
vae = AutoencoderKL.from_pretrained(
"stabilityai/sd-turbo",
subfolder="vae",
variant="fp16",
torch_dtype=torch.bfloat16,
)
# это можно пофиксить если задать другие ключи для Sequential, тогда он будет правильно выбирать адаптеры
# https://github.com/huggingface/peft/blob/b345a6e41521b977793cbdcaf932280081b18141/docs/source/developer_guides/custom_models.md?plain=1#L69
# vae = AutoencoderTiny.from_pretrained("madebyollin/taesd").to(
# device="cuda",
# dtype=torch.bfloat16,
# )
vae.encoder.forward = my_vae_encoder_fwd.__get__(
vae.encoder, vae.encoder.__class__
)
@@ -67,7 +92,10 @@ def __init__(
        ).cuda()
        vae.decoder.ignore_skip = False
        unet = UNet2DConditionModel.from_pretrained(
            "stabilityai/sd-turbo", subfolder="unet"
            "stabilityai/sd-turbo",
            subfolder="unet",
            variant="fp16",
            torch_dtype=torch.bfloat16,
        )

        if pretrained_name == "edge_to_image":
@@ -236,14 +264,16 @@ def __init__(
        self.target_modules_vae = target_modules_vae
        self.target_modules_unet = target_modules_unet

        # unet.enable_xformers_memory_efficient_attention()
        unet.enable_xformers_memory_efficient_attention()
        unet.to("cuda")
        vae.to("cuda")
        self.unet, self.vae = unet, vae
        self.vae.decoder.gamma = 1
        self.timesteps = torch.tensor([999], device="cuda").long()
        self.text_encoder.requires_grad_(False)

        self.cache_prompts = {}

    def set_eval(self):
        self.unet.eval()
        self.vae.eval()
@@ -337,6 +367,58 @@ def forward(
        ).clamp(-1, 1)
        return output_image

    def custom_forward(
        self,
        c_t,
        prompt=None,
        prompt_tokens=None,
        deterministic=True,
        r=1.0,
        noise_map=None,
    ):
        # Single-step sd-turbo pass: encode the control image, run the UNet at the
        # fixed timestep, take one scheduler step, and decode through the skip-connected VAE.
        # Text-encoder outputs are cached per prompt so CLIP only runs once per unique prompt.
        if prompt in self.cache_prompts:
            caption_enc = self.cache_prompts[prompt]
        else:
            caption_tokens = self.tokenizer(
                prompt,
                max_length=self.tokenizer.model_max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ).input_ids.cuda()
            caption_enc = self.text_encoder(caption_tokens)[0]
            self.cache_prompts[prompt] = caption_enc

        encoded_control = (
            # torch.Size([1, 4, 64, 64])
            self.vae.encode(c_t, return_dict=False)[0].sample()
            # self.vae.encode(c_t, return_dict=False)[0]
            * self.vae.config.scaling_factor
        )
        model_pred = self.unet(
            encoded_control,
            self.timesteps,
            encoder_hidden_states=caption_enc,
            return_dict=False,
        )[0]
        x_denoised = self.sched.step(
            model_pred,
            self.timesteps,
            encoded_control,
            return_dict=False,
        )[0]
        # x_denoised = x_denoised.to(model_pred.dtype)
        self.vae.decoder.incoming_skip_acts = self.vae.encoder.current_down_blocks
        output_image = (
            self.vae.decode(
                x_denoised / self.vae.config.scaling_factor,
                return_dict=False,
            )[0]
        ).clamp(-1, 1)

        return output_image

    def save_model(self, outf):
        sd = {}
        sd["unet_lora_target_modules"] = self.target_modules_unet
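The commented-out AutoencoderTiny block and the note above it point at a PEFT detail: LoRA target_modules are matched against module names, and children of a plain nn.Sequential are only named "0", "1", ..., so they cannot be selected by a meaningful key. Below is a minimal, self-contained sketch of that point; the module and layer names are illustrative and not taken from this repository.

from torch import nn
from peft import LoraConfig, get_peft_model


class NamedBackbone(nn.Module):
    # Registering layers under explicit attribute names (instead of nn.Sequential's
    # numeric children) makes them addressable via LoraConfig(target_modules=...).
    def __init__(self):
        super().__init__()
        self.proj_in = nn.Linear(16, 16)
        self.act = nn.ReLU()
        self.proj_out = nn.Linear(16, 4)

    def forward(self, x):
        return self.proj_out(self.act(self.proj_in(x)))


model = get_peft_model(NamedBackbone(), LoraConfig(r=4, target_modules=["proj_in", "proj_out"]))
model.print_trainable_parameters()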
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
5 changes: 5 additions & 0 deletions papers.md
@@ -0,0 +1,5 @@
- [StreamDiffusion: A Pipeline-level Solution for Real-time Interactive Generation](https://arxiv.org/pdf/2312.12491)
- [Immiscible Diffusion: Accelerating Diffusion Training with Noise Assignment](https://arxiv.org/pdf/2406.12303)
- [Live2Diff: Live Stream Translation via Uni-directional Attention in Video Diffusion Models](https://live2diff.github.io/)

- [building a custom LCM (latent consistency model)](https://github.com/huggingface/diffusers/tree/main/examples/consistency_distillation)
