[SD-XL] Add new pipelines (huggingface#3859)

* Add new text encoder * add transformers depth * More * Correct conversion script * Fix more * Fix more * Correct more * correct text encoder * Finish all * proof that in works in run local xl * clean up * Get refiner to work * Add red castle * Fix batch size * Improve pipelines more * Finish text2image tests * Add img2img test * Fix more * fix import * Fix embeddings for classic models (huggingface#3888) Fix embeddings for classic SD models. * Allow multiple prompts to be passed to the refiner (huggingface#3895) * finish more * Apply suggestions from code review * add watermarker * Model offload (huggingface#3889) * Model offload. * Model offload for refiner / img2img * Hardcode encoder offload on img2img vae encode Saves some GPU RAM in img2img / refiner tasks so it remains below 8 GB. --------- Co-authored-by: Patrick von Platen <[email protected]> * correct * fix * clean print * Update install warning for `invisible-watermark` * add: missing docstrings. * fix and simplify the usage example in img2img. * fix setup for watermarking. * Revert "fix setup for watermarking." This reverts commit 491bc9f. * fix: watermarking setup. * fix: op. * run make fix-copies. * make sure tests pass * improve convert * make tests pass * make tests pass * better error message * fiinsh * finish * Fix final test --------- Co-authored-by: Pedro Cuenca <[email protected]> Co-authored-by: Sayak Paul <[email protected]>
petrzjunior · Jul 6, 2023 · bc9a8ce · bc9a8ce
1 parent b62d9a1
commit bc9a8ce
Show file tree

Hide file tree

Showing 28 changed files with 2,512 additions and 61 deletions.
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
@@ -9,13 +9,20 @@ on:
       - v*-patch
 
 jobs:
-   build:
-    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
-    with:
-      commit_sha: ${{ github.sha }}
-      package: diffusers
-      notebook_folder: diffusers_doc
-      languages: en ko zh
+  build:
+    steps:
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+
+      - name: Build doc
+        uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+        with:
+          commit_sha: ${{ github.sha }}
+          package: diffusers
+          notebook_folder: diffusers_doc
+          languages: en ko zh
+
     secrets:
       token: ${{ secrets.HUGGINGFACE_PUSH }}
       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
@@ -9,9 +9,15 @@ concurrency:
 
 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
-    with:
-      commit_sha: ${{ github.event.pull_request.head.sha }}
-      pr_number: ${{ github.event.number }}
-      package: diffusers
-      languages: en ko
+    steps:
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+
+      - name: Build doc
+        uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+        with:
+          commit_sha: ${{ github.event.pull_request.head.sha }}
+          pr_number: ${{ github.event.number }}
+          package: diffusers
+          languages: en ko zh
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
@@ -62,7 +62,7 @@ jobs:
 
     - name: Install dependencies
       run: |
-        apt-get update && apt-get install libsndfile1-dev -y
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
         python -m pip install -e .[quality,test]
 
     - name: Environment

diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -14,6 +14,7 @@ RUN apt update && \
                    libsndfile1-dev \
                    python3.8 \
                    python3-pip \
+                   libgl1 \
                    python3.8-venv && \
     rm -rf /var/lib/apt/lists
 
@@ -27,6 +28,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
         torch \
         torchvision \
         torchaudio \
+        invisible_watermark \
         --extra-index-url https://download.pytorch.org/whl/cpu && \
     python3 -m pip install --no-cache-dir \
         accelerate \
@@ -40,4 +42,4 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
         tensorboard \
         transformers
 
-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -12,6 +12,7 @@ RUN apt update && \
                    curl \
                    ca-certificates \
                    libsndfile1-dev \
+                   libgl1 \
                    python3.8 \
                    python3-pip \
                    python3.8-venv && \
@@ -26,7 +27,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
     python3 -m pip install --no-cache-dir \
         torch \
         torchvision \
-        torchaudio && \
+        torchaudio \
+        invisible_watermark && \
     python3 -m pip install --no-cache-dir \
         accelerate \
         datasets \

diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx
@@ -0,0 +1,42 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable Diffusion XL
+
+Stable Diffusion 2 is a text-to-image _latent diffusion_ model built upon the work of [Stable Diffusion 1](https://stability.ai/blog/stable-diffusion-public-release). 
+The project to train Stable Diffusion 2 was led by Robin Rombach and Katherine Crowson from [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/).
+
+*The Stable Diffusion 2.0 release includes robust text-to-image models trained using a brand new text encoder (OpenCLIP), developed by LAION with support from Stability AI, which greatly improves the quality of the generated images compared to earlier V1 releases. The text-to-image models in this release can generate images with default resolutions of both 512x512 pixels and 768x768 pixels. 
+These models are trained on an aesthetic subset of the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) created by the DeepFloyd team at Stability AI, which is then further filtered to remove adult content using [LAION’s NSFW filter](https://openreview.net/forum?id=M3Y74vmsMcY).*
+
+For more details about how Stable Diffusion 2 works and how it differs from Stable Diffusion 1, please refer to the official [launch announcement post](https://stability.ai/blog/stable-diffusion-v2-release).
+
+## Tips
+
+### Available checkpoints:
+
+- *Text-to-Image (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9) with [`StableDiffusionXLPipeline`]
+- *Image-to-Image / Refiner (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9) with [`StableDiffusionXLImg2ImgPipeline`]
+
+TODO
+
+## StableDiffusionXLPipeline
+
+[[autodoc]] StableDiffusionXLPipeline
+	- all
+	- __call__
+
+## StableDiffusionXLImg2ImgPipeline
+
+[[autodoc]] StableDiffusionXLImg2ImgPipeline
+	- all
+	- __call__
diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -126,6 +126,13 @@
         "--controlnet", action="store_true", default=None, help="Set flag if this is a controlnet checkpoint."
     )
     parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    parser.add_argument(
+        "--vae_path",
+        type=str,
+        default=None,
+        required=False,
+        help="Set to a path, hub id to an already converted vae to not convert it again.",
+    )
     args = parser.parse_args()
 
     pipe = download_from_original_stable_diffusion_ckpt(
@@ -144,6 +151,7 @@
         stable_unclip_prior=args.stable_unclip_prior,
         clip_stats_path=args.clip_stats_path,
         controlnet=args.controlnet,
+        vae_path=args.vae_path,
     )
 
     if args.half:

diff --git a/setup.py b/setup.py
@@ -89,6 +89,7 @@
     "huggingface-hub>=0.13.2",
     "requests-mock==1.10.0",
     "importlib_metadata",
+    "invisible-watermark",
     "isort>=5.5.4",
     "jax>=0.2.8,!=0.3.2",
     "jaxlib>=0.1.65",
@@ -193,6 +194,7 @@ def run(self):
     "compel",
     "datasets",
     "Jinja2",
+    "invisible-watermark",
     "k-diffusion",
     "librosa",
     "omegaconf",

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -5,6 +5,7 @@
     OptionalDependencyNotAvailable,
     is_flax_available,
     is_inflect_available,
+    is_invisible_watermark_available,
     is_k_diffusion_available,
     is_k_diffusion_version,
     is_librosa_available,
@@ -179,6 +180,14 @@
         VQDiffusionPipeline,
     )
 
+try:
+    if not (is_torch_available() and is_transformers_available() and is_invisible_watermark_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils.dummy_torch_and_transformers_and_invisible_watermark_objects import *  # noqa F403
+else:
+    from .pipelines import StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline
+
 try:
     if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
         raise OptionalDependencyNotAvailable()

diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
@@ -13,6 +13,7 @@
     "huggingface-hub": "huggingface-hub>=0.13.2",
     "requests-mock": "requests-mock==1.10.0",
     "importlib_metadata": "importlib_metadata",
+    "invisible-watermark": "invisible-watermark",
     "isort": "isort>=5.5.4",
     "jax": "jax>=0.2.8,!=0.3.2",
     "jaxlib": "jaxlib>=0.1.65",

diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
@@ -1118,7 +1118,9 @@ def __call__(
         value = attn.to_v(encoder_hidden_states)
 
         head_dim = inner_dim // attn.heads
+
         query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
 

diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
@@ -38,6 +38,7 @@ def get_down_block(
     add_downsample,
     resnet_eps,
     resnet_act_fn,
+    transformer_layers_per_block=1,
     num_attention_heads=None,
     resnet_groups=None,
     cross_attention_dim=None,
@@ -111,6 +112,7 @@ def get_down_block(
             raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
         return CrossAttnDownBlock2D(
             num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
@@ -232,6 +234,7 @@ def get_up_block(
     add_upsample,
     resnet_eps,
     resnet_act_fn,
+    transformer_layers_per_block=1,
     num_attention_heads=None,
     resnet_groups=None,
     cross_attention_dim=None,
@@ -287,6 +290,7 @@ def get_up_block(
             raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
         return CrossAttnUpBlock2D(
             num_layers=num_layers,
+            transformer_layers_per_block=transformer_layers_per_block,
             in_channels=in_channels,
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
@@ -517,6 +521,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -559,7 +564,7 @@ def __init__(
                         num_attention_heads,
                         in_channels // num_attention_heads,
                         in_channels=in_channels,
-                        num_layers=1,
+                        num_layers=transformer_layers_per_block,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
@@ -862,6 +867,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -906,7 +912,7 @@ def __init__(
                         num_attention_heads,
                         out_channels // num_attention_heads,
                         in_channels=out_channels,
-                        num_layers=1,
+                        num_layers=transformer_layers_per_block,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
@@ -1995,6 +2001,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -2040,7 +2047,7 @@ def __init__(
                         num_attention_heads,
                         out_channels // num_attention_heads,
                         in_channels=out_channels,
-                        num_layers=1,
+                        num_layers=transformer_layers_per_block,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,