[SVD] fix a bug when passing image as tensor (huggingface#6999)

* fix * update docstring --------- Co-authored-by: yiyixuxu <yixu310@gmail,com>
EMinsight · Feb 18, 2024 · 8974c50 · 8974c50
1 parent c18058b
commit 8974c50
Showing 1 changed file with 10 additions and 11 deletions.
diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py
@@ -132,15 +132,15 @@ def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free
             image = _resize_with_antialiasing(image, (224, 224))
             image = (image + 1.0) / 2.0
 
-            # Normalize the image with for CLIP input
-            image = self.feature_extractor(
-                images=image,
-                do_normalize=True,
-                do_center_crop=False,
-                do_resize=False,
-                do_rescale=False,
-                return_tensors="pt",
-            ).pixel_values
+        # Normalize the image with for CLIP input
+        image = self.feature_extractor(
+            images=image,
+            do_normalize=True,
+            do_center_crop=False,
+            do_resize=False,
+            do_rescale=False,
+            return_tensors="pt",
+        ).pixel_values
 
         image = image.to(device=device, dtype=dtype)
         image_embeddings = self.image_encoder(image).image_embeds
@@ -333,8 +333,7 @@ def __call__(
 
         Args:
             image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
-                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
-                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
+                Image or images to guide image generation. If you provide a tensor, the expected value range is between `[0,1]`.
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):