From 3768ed9876dee8ec38ae1ab767a9748e98c26251 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Thu, 16 Mar 2023 01:31:55 +0000
Subject: [PATCH 01/25] refactor post-processing with image processor (update
 decode_latents/run_safety_checker) and copy these 2 methods to all pipelines

---
 .../alt_diffusion/pipeline_alt_diffusion.py   | 41 ++++++++--------
 .../pipeline_alt_diffusion_img2img.py         | 19 ++++---
 .../pipeline_paint_by_example.py              | 44 ++++++++++-------
 .../pipeline_semantic_stable_diffusion.py     | 46 +++++++++++------
 .../pipeline_cycle_diffusion.py               | 43 ++++++++++------
 .../pipeline_stable_diffusion.py              | 41 ++++++++--------
 ...line_stable_diffusion_attend_and_excite.py | 45 ++++++++++-------
 .../pipeline_stable_diffusion_controlnet.py   | 44 +++++++++--------
 .../pipeline_stable_diffusion_depth2img.py    | 34 +++++++------
 ...peline_stable_diffusion_image_variation.py | 42 ++++++++++------
 .../pipeline_stable_diffusion_img2img.py      | 28 ++++++-----
 .../pipeline_stable_diffusion_inpaint.py      | 42 ++++++++++------
 ...ipeline_stable_diffusion_inpaint_legacy.py | 42 ++++++++++------
 ...eline_stable_diffusion_instruct_pix2pix.py | 42 ++++++++++------
 .../pipeline_stable_diffusion_k_diffusion.py  | 44 ++++++++++-------
 ...ipeline_stable_diffusion_latent_upscale.py | 24 ++++++---
 .../pipeline_stable_diffusion_panorama.py     | 44 ++++++++++-------
 .../pipeline_stable_diffusion_pix2pix_zero.py | 49 ++++++++++++-------
 .../pipeline_stable_diffusion_sag.py          | 44 ++++++++++-------
 .../pipeline_stable_diffusion_upscale.py      | 27 ++++++----
 .../pipeline_stable_unclip.py                 | 24 ++++++---
 .../pipeline_stable_unclip_img2img.py         | 22 ++++++---
 .../stable_diffusion/safety_checker.py        |  5 +-
 .../pipeline_stable_diffusion_safe.py         |  1 -
 ...ipeline_versatile_diffusion_dual_guided.py | 22 ++++++---
 ...ine_versatile_diffusion_image_variation.py | 20 +++++---
 ...eline_versatile_diffusion_text_to_image.py | 20 +++++---
 27 files changed, 549 insertions(+), 350 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index b94a2ec05649..5a4662c1a40c 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -22,6 +22,7 @@
 from diffusers.utils import is_accelerate_available, is_accelerate_version
 
 from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
@@ -165,6 +166,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_vae_slicing(self):
@@ -409,21 +411,17 @@ def _encode_prompt(
         return prompt_embeds
 
     def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+        safety_checker_input = \
self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def prepare_extra_step_kwargs(self, generator, eta): @@ -682,24 +680,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + if output_type == "latent": image = latents has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) else: - # 8. Post-processing image = self.decode_latents(latents) - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False + + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 05138c86f246..8082838ca158 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -193,6 +193,7 @@ def __init__( new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -203,11 +204,8 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.register_to_config( - requires_safety_checker=requires_safety_checker, - ) + self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -715,14 +713,15 @@ def __call__( image = latents has_nsfw_concept = None - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: - has_nsfw_concept = False + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, 
output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 353805228671..2844d373acca 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -22,9 +22,10 @@ from diffusers.utils import is_accelerate_available +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import logging, randn_tensor +from ...utils import deprecate, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -184,6 +185,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_sequential_cpu_offload(self, gpu_id=0): @@ -226,13 +228,11 @@ def _execution_device(self): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -258,8 +258,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs @@ -560,15 +558,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 11. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 12. 
Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+
+        else:
+            image = self.decode_latents(latents)
+
+            if self.safety_checker is not None:
+                image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+            else:
+                has_nsfw_concept = False
 
-        # 13. Convert to PIL
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index a421a844c329..d876b68a6047 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -5,11 +5,13 @@
 import torch
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
+from ...image_processor import VaeImageProcessor
+
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...pipeline_utils import DiffusionPipeline
 from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import logging, randn_tensor
+from ...utils import deprecate, logging, randn_tensor
 from . import SemanticStableDiffusionPipelineOutput
@@ -129,15 +131,23 @@ def __init__(
             feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+        safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+        image, has_nsfw_concept = self.safety_checker(
+            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+        )
+        return image, has_nsfw_concept
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -680,21 +690,27 @@ def __call__(
                 if callback is not None and i % callback_steps == 0:
                     callback(i, t, latents)
 
-        # 8. Post-processing
-        image = self.decode_latents(latents)
-
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
+        if output_type not in ["latent", "pt", "np", "pil"]:
+            deprecation_message = (
+                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
+                "`pil`, `np`, `pt`, `latent`"
             )
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
+            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "np"
+
+        if output_type == "latent":
+            image = latents
             has_nsfw_concept = None
 
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        else:
+            image = self.decode_latents(latents)
+
+            if self.safety_checker is not None:
+                image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
+            else:
+                has_nsfw_concept = False
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image, has_nsfw_concept)
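Note on the pattern above: every pipeline now builds a VaeImageProcessor in __init__ and routes all tensor-to-PIL/NumPy conversion through image_processor.postprocess, while decode_latents stops at a [0, 1]-clamped torch tensor. The sketch below spells out the postprocess behavior these call sites assume; it is an illustration, not the actual implementation in src/diffusers/image_processor.py, and it is worth double-checking that the real postprocess does not denormalize a second time, since decode_latents already maps the decoded sample into [0, 1].

    import numpy as np
    import PIL.Image
    import torch

    def postprocess_sketch(image: torch.Tensor, output_type: str = "pil"):
        # Assumed input convention (from the call sites above): an NCHW float
        # tensor already clamped to [0, 1] by decode_latents().
        if output_type == "pt":
            return image
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()  # NCHW -> NHWC
        if output_type == "np":
            return image
        # output_type == "pil": scale to uint8 and wrap each batch item
        return [PIL.Image.fromarray((img * 255).round().astype(np.uint8)) for img in image]
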
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index e977071b9c6c..f5652c26b007 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -24,6 +24,7 @@
 from diffusers.utils import is_accelerate_available, is_accelerate_version
 
 from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import DDIMScheduler
 from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor
@@ -219,6 +220,8 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
@@ -495,13 +498,11 @@ def prepare_extra_step_kwargs(self, generator, eta):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+        safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+        image, has_nsfw_concept = self.safety_checker(
+            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+        )
         return image, has_nsfw_concept
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
@@ -509,8 +510,6 @@ def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
@@ -760,15 +759,27 @@ def __call__(
                 if callback is \
not None and i % callback_steps == 0: callback(i, t, latents) - # 9. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 5294fa4cfa06..ff07eb21d76b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -20,6 +20,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -168,6 +169,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): @@ -412,21 +414,17 @@ def _encode_prompt( return prompt_embeds def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def prepare_extra_step_kwargs(self, generator, eta): @@ -685,24 +683,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + if output_type == "latent": image = latents has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) else: - # 8. Post-processing image = self.decode_latents(latents) - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False + + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 2d32c0ba8b62..0b2078c5557e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -21,10 +21,12 @@ from torch.nn import functional as F from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor + from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -227,6 +229,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -433,13 +436,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -447,8 +448,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -960,15 +959,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 8. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + has_nsfw_concept = None - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index fd82281005ad..28b52c4ddb8f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -23,12 +23,14 @@ from torch import nn from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.controlnet import ControlNetOutput from ...models.modeling_utils import ModelMixin from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( PIL_INTERPOLATION, + deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -223,6 +225,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -452,13 +455,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -466,8 +467,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -951,24 +950,27 @@ def __call__( self.controlnet.to("cpu") torch.cuda.empty_cache() + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + if output_type == "latent": image = latents has_nsfw_concept = None - elif output_type == "pil": - # 8. 
Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing + else: image = self.decode_latents(latents) - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False + + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 9087064ae0b8..c9f0f1e00027 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -23,6 +23,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging, randn_tensor @@ -120,6 +121,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -298,13 +300,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -312,8 +312,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -676,12 +674,20 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 10. 
Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index a7165457c67c..6f337d7db363 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -21,6 +21,7 @@ from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, logging, randn_tensor @@ -118,6 +119,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_sequential_cpu_offload(self, gpu_id=0): @@ -183,13 +185,11 @@ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -197,8 +197,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -398,15 +396,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 8. 
Post-processing
-        image = self.decode_latents(latents)
+        if output_type not in ["latent", "pt", "np", "pil"]:
+            deprecation_message = (
+                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
+                "`pil`, `np`, `pt`, `latent`"
+            )
+            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
+            output_type = "np"
+
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+
+        else:
+            image = self.decode_latents(latents)
 
-        # 9. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+            if self.safety_checker is not None:
+                image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+            else:
+                has_nsfw_concept = False
 
-        # 10. Convert to PIL
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 8b3a7944def1..4624f9c7492e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -119,7 +119,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
     _optional_components = ["safety_checker", "feature_extractor"]
-
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -196,6 +197,7 @@ def __init__(
             new_config = dict(unet.config)
             new_config["sample_size"] = 64
             unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -206,11 +208,9 @@ def __init__(
             feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-
-        self.register_to_config(
-            requires_safety_checker=requires_safety_checker,
-        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
     def enable_sequential_cpu_offload(self, gpu_id=0):
@@ -424,7 +423,8 @@ def _encode_prompt(
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
 
         return prompt_embeds
-
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
         feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
         safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
@@ -432,7 +432,8 @@ def run_safety_checker(self, image, device, dtype):
             images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
         )
         return image, has_nsfw_concept
-
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
@@ -723,14 +724,15 @@ def __call__(
             image = latents
             has_nsfw_concept = None
 
-        image 
= self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: - has_nsfw_concept = False + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index b645ba667f77..7b9b1dde4c4a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -22,6 +22,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor @@ -254,6 +255,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload @@ -471,13 +473,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -503,8 +503,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs @@ -870,15 +868,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 11. 
Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) - # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 13. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index a770fb18aaa0..9c25390e560f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -22,6 +22,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -198,6 +199,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload @@ -415,13 +417,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -429,8 +429,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() 
return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -692,16 +690,28 @@ def __call__( # use original latents corresponding to unmasked portions of the image latents = (init_latents_orig * mask) + (latents * (1 - mask)) + + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 10. Post-processing - image = self.decode_latents(latents) + if output_type == "latent": + image = latents + has_nsfw_concept = None - # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 12. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 953df11aa4f7..be3b594f3c09 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -20,6 +20,7 @@ import torch from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -128,6 +129,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) @torch.no_grad() @@ -376,15 +378,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 10. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) - # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 12. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: @@ -610,13 +624,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -642,8 +654,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index f3db54caa342..ae5616d1a3f3 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -18,9 +18,10 @@ import torch from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser +from ...image_processor import VaeImageProcessor from ...pipelines import DiffusionPipeline from ...schedulers import LMSDiscreteScheduler -from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor +from ...utils import is_accelerate_available, is_accelerate_version, deprecate, logging, randn_tensor from . 
import StableDiffusionPipelineOutput @@ -109,6 +110,7 @@ def __init__( ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) model = ModelWrapper(unet, scheduler.alphas_cumprod) if scheduler.prediction_type == "v_prediction": @@ -336,13 +338,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -350,8 +350,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs(self, prompt, height, width, callback_steps): @@ -527,15 +525,27 @@ def model_fn(x, t): # 7. Run k-diffusion solver latents = self.sampler(model_fn, latents, sigmas) - # 8. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 624d0e625828..038a66443826 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -20,9 +20,10 @@ import torch.nn.functional as F from transformers import CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler -from ...utils import is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, deprecate, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -91,6 +92,8 @@ def __init__( unet=unet, scheduler=scheduler, ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -223,8 +226,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs(self, prompt, image, callback_steps): @@ -505,12 +506,19 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 10. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 11. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + if output_type == "latent": + image = latents + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 3fea4c2d83bb..c9770f0d2a33 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -17,9 +17,10 @@ import torch from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, PNDMScheduler -from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_available, is_accelerate_version, deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -122,6 +123,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -328,13 +330,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -342,8 +342,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -648,15 +646,27 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 8. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 7de12bd291fb..26e6632f06ce 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -28,6 +28,7 @@ CLIPTokenizer, ) +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler @@ -40,6 +41,7 @@ logging, randn_tensor, replace_example_docstring, + deprecate ) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -356,6 +358,7 @@ def __init__( inverse_scheduler=inverse_scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload @@ -568,13 +571,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -582,8 +583,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from 
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -1032,24 +1031,36 @@ def __call__( if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() - # 11. Post-process the latents. - edited_image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) - # 12. Run the safety checker. - edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 13. Convert to PIL. - if output_type == "pil": - edited_image = self.numpy_to_pil(edited_image) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() if not return_dict: - return (edited_image, has_nsfw_concept) + return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @torch.no_grad() @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING) @@ -1246,7 +1257,7 @@ def invert( # 9. Convert to PIL. if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type='pil') if not return_dict: return (inverted_latents, image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index b24354a8e568..1b87fab8ad56 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -19,9 +19,10 @@ import torch.nn.functional as F from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_available, is_accelerate_version, deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -139,6 +140,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -345,13 +347,11 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -359,8 +359,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -673,15 +671,27 @@ def get_map_size(module, input, output): if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 8. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if output_type == "latent": + image = latents + has_nsfw_concept = None + + else: + image = self.decode_latents(latents) + + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 9f8f44a12bb4..411fc71a4da3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -20,6 +20,7 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, logging, randn_tensor @@ -113,6 +114,8 @@ def __init__( low_res_scheduler=low_res_scheduler, scheduler=scheduler, ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(max_noise_level=max_noise_level) def enable_sequential_cpu_offload(self, gpu_id=0): @@ -313,8 +316,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs(self, prompt, image, noise_level, callback_steps): @@ -580,13 +581,21 @@ def __call__( callback(i, t, latents) # 10. Post-processing - # make sure the VAE is in float32 mode, as it overflows in float16 - self.vae.to(dtype=torch.float32) - image = self.decode_latents(latents.float()) - - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + self.vae.to(dtype=torch.float32) + image = self.decode_latents(latents.float()) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index a8ba0b504628..06e8612bedec 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -19,10 +19,11 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from transformers.models.clip.modeling_clip import CLIPTextModelOutput +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_available, logging, randn_tensor, replace_example_docstring, deprecate from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -135,6 +136,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing def enable_vae_slicing(self): @@ -443,8 +445,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs with prepare_extra_step_kwargs->prepare_prior_extra_step_kwargs, scheduler->prior_scheduler @@ -881,12 +881,20 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 14. Post-processing - image = self.decode_latents(latents) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents - # 15. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 99caa8be65a5..13cd1d4494f5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -21,6 +21,7 @@ from diffusers.utils.import_utils import is_accelerate_available +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers @@ -137,6 +138,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing def enable_vae_slicing(self): @@ -398,8 +400,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -774,12 +774,20 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 9. Post-processing - image = self.decode_latents(latents) + # 9. Post-processing + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + if output_type == "latent": + image = latents + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 84b8aeb7bcde..a89a7af17c38 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -85,7 +85,10 @@ def forward(self, clip_input, images): for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): if has_nsfw_concept: - images[idx] = np.zeros(images[idx].shape) # black image + if isinstance(images, np.ndarray): + images[idx] = np.zeros(images[idx].shape) # black image + elif isinstance(images, torch.Tensor): + images[idx] = torch.zeros(images[idx].shape) if any(has_nsfw_concepts): logger.warning( diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 3d0ddce7157e..626acf5a6207 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -361,7 +361,6 @@ def run_safety_checker(self, image, device, dtype, enable_safety_guidance): flagged_images = None return image, has_nsfw_concept, flagged_images - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 529d9a2ae9c0..83aa2e54cd85 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -26,9 +26,10 @@ CLIPVisionModelWithProjection, ) +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, deprecate, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel @@ -88,6 +89,7 @@ def __init__( scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) if self.text_unet is not None and ( "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention @@ -332,8 +334,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from 
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -572,12 +572,20 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 9. Post-processing + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + image = self.decode_latents(latents) - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index fd6855af3852..6b362eaa2510 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -21,6 +21,7 @@ import torch.utils.checkpoint from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import is_accelerate_available, logging, randn_tensor @@ -71,6 +72,7 @@ def __init__( scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -192,8 +194,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -414,12 +414,20 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 8. Post-processing + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + image = self.decode_latents(latents) - # 9. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index d1bb754c7b58..8da929e55e1c 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -19,6 +19,7 @@ import torch.utils.checkpoint from transformers import CLIPFeatureExtractor, CLIPTextModelWithProjection, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import is_accelerate_available, logging, randn_tensor @@ -76,6 +77,7 @@ def __init__( scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) if self.text_unet is not None: self._swap_unet_attention_blocks() @@ -249,8 +251,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -488,12 +488,20 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 9. Post-processing + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + image = self.decode_latents(latents) - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) From 71eda72bbec5b775217e0055baa9f6e9106de4c6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 31 Mar 2023 21:33:19 +0000 Subject: [PATCH 02/25] fix --- .../paint_by_example/pipeline_paint_by_example.py | 2 +- .../pipeline_semantic_stable_diffusion.py | 5 ++--- .../stable_diffusion/pipeline_cycle_diffusion.py | 2 +- .../pipeline_stable_diffusion_attend_and_excite.py | 10 ++++++++-- .../pipeline_stable_diffusion_controlnet.py | 2 +- .../pipeline_stable_diffusion_image_variation.py | 2 +- .../pipeline_stable_diffusion_img2img.py | 6 +++--- .../pipeline_stable_diffusion_inpaint_legacy.py | 2 +- .../pipeline_stable_diffusion_k_diffusion.py | 2 +- .../pipeline_stable_diffusion_latent_upscale.py | 2 +- .../pipeline_stable_diffusion_panorama.py | 9 ++++++++- .../pipeline_stable_diffusion_pix2pix_zero.py | 4 ++-- .../stable_diffusion/pipeline_stable_diffusion_sag.py | 9 ++++++++- .../pipeline_stable_diffusion_upscale.py | 2 +- .../stable_diffusion/pipeline_stable_unclip.py | 2 +- .../stable_diffusion/pipeline_stable_unclip_img2img.py | 4 ++-- .../pipeline_versatile_diffusion_dual_guided.py | 8 ++++---- .../pipeline_versatile_diffusion_image_variation.py | 8 ++++---- .../pipeline_versatile_diffusion_text_to_image.py | 8 ++++---- 19 files changed, 54 insertions(+), 35 deletions(-) diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 2844d373acca..65b588a83c8e 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -574,7 +574,7 @@ def __call__( image = self.decode_latents(latents) if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype) else: has_nsfw_concept = False diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index d876b68a6047..42af7137a9d1 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -6,7 +6,6 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor - from ...models import AutoencoderKL, UNet2DConditionModel from ...pipeline_utils import DiffusionPipeline from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker @@ -133,7 +132,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config(requires_safety_checker=requires_safety_checker) - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") @@ -706,7 +705,7 @@ def __call__( image = self.decode_latents(latents) if self.safety_checker is not None: - image, 
has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embedding.dtype) else: has_nsfw_concept = False diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index f5652c26b007..4e516b7e87d5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -779,7 +779,7 @@ def __call__( else: has_nsfw_concept = False - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 0b2078c5557e..785a6ee39759 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -22,11 +22,17 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor - from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 28b52c4ddb8f..1131b8754a5c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -962,7 +962,7 @@ def __call__( image = latents has_nsfw_concept = None - else: + else: image = self.decode_latents(latents) if self.safety_checker is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 6f337d7db363..04562239da04 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -412,7 +412,7 @@ def __call__( image = self.decode_latents(latents) if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype) else: has_nsfw_concept = False diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 4624f9c7492e..f90a93a30b92 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -119,7 +119,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" _optional_components = ["safety_checker", "feature_extractor"] - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( self, @@ -423,7 +423,7 @@ def _encode_prompt( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") @@ -432,7 +432,7 @@ def run_safety_checker(self, image, device, dtype): images=image, clip_input=safety_checker_input.pixel_values.to(dtype) ) return image, has_nsfw_concept - + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 9c25390e560f..6635c8720763 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -690,7 +690,7 @@ def __call__( # use original latents corresponding to unmasked portions of the image latents = (init_latents_orig * mask) + (latents * (1 - mask)) - + if output_type not in ["latent", "pt", "np", "pil"]: deprecation_message = ( f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index ae5616d1a3f3..ae56ae22feb4 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -21,7 +21,7 @@ from ...image_processor import VaeImageProcessor from ...pipelines import DiffusionPipeline from ...schedulers import LMSDiscreteScheduler -from ...utils import is_accelerate_available, is_accelerate_version, deprecate, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor from . 
import StableDiffusionPipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 038a66443826..298277f0e4d7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -23,7 +23,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler -from ...utils import is_accelerate_available, deprecate, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index c9770f0d2a33..76e1318ea145 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -20,7 +20,14 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, PNDMScheduler -from ...utils import is_accelerate_available, is_accelerate_version, deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 26e6632f06ce..7c7fec032e51 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -36,12 +36,12 @@ from ...utils import ( PIL_INTERPOLATION, BaseOutput, + deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring, - deprecate ) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -1257,7 +1257,7 @@ def invert( # 9. Convert to PIL. 
if output_type == "pil": - image = self.image_processor.postprocess(image, output_type='pil') + image = self.image_processor.postprocess(image, output_type="pil") if not return_dict: return (inverted_latents, image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 1b87fab8ad56..e439b1688632 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -22,7 +22,14 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, is_accelerate_version, deprecate, logging, randn_tensor, replace_example_docstring +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 411fc71a4da3..9b33a8edb09d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -588,7 +588,7 @@ def __call__( ) deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) output_type = "np" - + if output_type == "latent": image = latents else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 06e8612bedec..162053488529 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -23,7 +23,7 @@ from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, logging, randn_tensor, replace_example_docstring, deprecate +from ...utils import deprecate, is_accelerate_available, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 13cd1d4494f5..d9b417038e6d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -25,7 +25,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -774,7 +774,7 @@ def __call__( if callback is not None 
and i % callback_steps == 0: callback(i, t, latents) - # 9. Post-processing + # 9. Post-processing if output_type not in ["latent", "pt", "np", "pil"]: deprecation_message = ( f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 83aa2e54cd85..ad450b41c6c2 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -29,7 +29,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, deprecate, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel @@ -583,9 +583,9 @@ def __call__( if output_type == "latent": image = latents - image = self.decode_latents(latents) - - image = self.image_processor.postprocess(image, output_type=output_type) + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 6b362eaa2510..f03d0aa21e89 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -24,7 +24,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -425,9 +425,9 @@ def __call__( if output_type == "latent": image = latents - image = self.decode_latents(latents) - - image = self.image_processor.postprocess(image, output_type=output_type) + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index 8da929e55e1c..c51b904e070b 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -22,7 +22,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, 
ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel @@ -499,9 +499,9 @@ def __call__( if output_type == "latent": image = latents - image = self.decode_latents(latents) - - image = self.image_processor.postprocess(image, output_type=output_type) + else: + image = self.decode_latents(latents) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) From 83da056808995493f13bbc72e21572bbb22aa579 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 31 Mar 2023 21:35:58 +0000 Subject: [PATCH 03/25] make style --- .../pipeline_semantic_stable_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index 42af7137a9d1..28dbdbe8289f 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -705,7 +705,7 @@ def __call__( image = self.decode_latents(latents) if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embedding.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype) else: has_nsfw_concept = False From d91bcc94160f43e25c12d3747cf0b49993c4ebd3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 6 Apr 2023 08:12:27 +0000 Subject: [PATCH 04/25] refactor post-processing --- src/diffusers/image_processor.py | 14 ++++++ .../pipeline_stable_diffusion.py | 45 +++++++++---------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 80e3412991cf..ee161511f161 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -164,6 +164,20 @@ def postprocess( image, output_type: str = "pil", ): + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + return image + + if image.min() < 0: + image = (image / 2 + 0.5).clamp(0, 1) + if isinstance(image, torch.Tensor) and output_type == "pt": return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 80d313415427..c5e96d737ac7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -422,15 +422,24 @@ def _encode_prompt( return prompt_embeds - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = None + else: + image = self.image_processor.postprocess(image, output_type='pt') + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) @@ -691,28 +700,14 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) + + if not output_type == "latent": - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: From 6cbd1acc172d08df2cc61332b7286d1f725d6b54 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Fri, 7 Apr 2023 22:23:39 +0000 Subject: [PATCH 05/25] add pipelinelatenttestermixin --- src/diffusers/image_processor.py | 12 +++- .../pipeline_stable_diffusion.py | 2 +- tests/test_pipelines_common.py | 72 +++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index ee161511f161..aeace4c07bf7 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -21,7 +21,7 @@ from PIL import Image from .configuration_utils import ConfigMixin, register_to_config -from .utils import CONFIG_NAME, PIL_INTERPOLATION +from .utils import deprecate, CONFIG_NAME, PIL_INTERPOLATION class VaeImageProcessor(ConfigMixin): @@ -82,11 +82,19 @@ def numpy_to_pt(images): @staticmethod def pt_to_numpy(images): """ - Convert a numpy image to a pytorch tensor + Convert a pytorch tensor to a numpy image """ images = images.cpu().permute(0, 2, 3, 1).float().numpy() return images + def pt_to_pil(images): + """ + Convert a pytorch tensor to a pil image + """ + images = self.pt_to_numpy(images) + images = self.numpy_to_pil(images) + return images + @staticmethod def normalize(images): """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index c5e96d737ac7..87098f51c00a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -435,7 +435,7 @@ def run_safety_checker(self, image, device, dtype, output_type='pil'): return image, has_nsfw_concept def decode_latents(self, latents): - warnings.warn( + warnings.warn( "The decode_latents method is deprecated and will be removed in a future version. 
Please"
             " use VaeImageProcessor instead",
             FutureWarning,
         )
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index 13fbe924c799..dbe51bc70c17 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -12,6 +12,7 @@

 import diffusers
 from diffusers import DiffusionPipeline
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
 from diffusers.utils.testing_utils import require_torch, torch_device
@@ -26,6 +27,77 @@ def to_np(tensor):

     return tensor


+class PipelineLatentTesterMixin:
+    """
+    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
+    It provides a set of common tests for PyTorch pipelines that have a vae, e.g.
+    equivalence of different input and output types, etc.
+    """
+
+    @property
+    def image_params(self) -> frozenset:
+        raise NotImplementedError(
+            "You need to set the attribute `image_params` in the child test class. "
+            "`image_params` lists the image inputs that are tested to ensure all accepted input image types "
+            "(i.e. `pt`, `np`, `pil`) produce the same results"
+        )
+
+    def get_dummy_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"):
+        inputs = self.get_dummy_inputs(device, seed)
+
+        def convert_pt_to_type(image, input_image_type):
+            if input_image_type == "pt":
+                input_image = image
+            elif input_image_type == "np":
+                input_image = VaeImageProcessor.pt_to_numpy(image)
+            elif input_image_type == "pil":
+                input_image = VaeImageProcessor.pt_to_pil(image)
+            else:
+                raise ValueError(f"unsupported input_image_type {input_image_type}.")
+            return input_image
+
+        for image_param in self.image_params:
+            if image_param in inputs.keys():
+                inputs[image_param] = convert_pt_to_type(inputs[image_param], input_image_type)
+
+        inputs["output_type"] = output_type
+
+        return inputs
+
+    def test_pt_np_pil_outputs_equivalent(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        output_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pt"))[0]
+        output_np = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="np"))[0]
+        output_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pil"))[0]
+
+        max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
+        self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generates different results from `output_type=='np'`")
+
+        max_diff = np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max()
+        self.assertLess(max_diff, 1e-4, "`output_type=='pil'` generates different results from `output_type=='np'`")
+
+    def test_pt_np_pil_inputs_equivalent(self):
+        if len(self.image_params) == 0:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        out_input_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
+        out_input_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+        out_input_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pil"))[0]
+
+        max_diff = np.abs(out_input_pt - out_input_np).max()
+        self.assertLess(max_diff, 1e-4, "`input_type=='pt'` generates different results from `input_type=='np'`")
+        max_diff = np.abs(out_input_pil - out_input_np).max()
+        self.assertLess(max_diff, 1e-2, "`input_type=='pil'` generates different results from `input_type=='np'`")
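+
+    # A minimal usage sketch, assuming a hypothetical `MyImg2ImgPipeline` that takes an
+    # `image` argument: a child test class combines this mixin with PipelineTesterMixin
+    # and names its image inputs in `image_params`, e.g.
+    #
+    #     class MyImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    #         pipeline_class = MyImg2ImgPipeline
+    #         image_params = frozenset(["image"])  # each listed input is fed as pt, np and pil in turn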
self.assertLess(max_diff, 1e-2, "`input_type=='pt'` generate different result from `input_type=='np'`") + @require_torch class PipelineTesterMixin: From c6d2405b91e1c3f1f91e6c796fc7ef8d544fe6ba Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 8 Apr 2023 21:20:32 +0000 Subject: [PATCH 06/25] update stablediffusionpipeline fast test using pipelinelatenttestermixim --- tests/pipelines/stable_diffusion/test_stable_diffusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 857122782d35..c28db6671860 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -47,10 +47,11 @@ torch.backends.cuda.matmul.allow_tf32 = False -class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): +class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = frozenset([]) def get_dummy_components(self): torch.manual_seed(0) From fe8e13ec64f98f2185ca40e2b5f34585ac64403a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 8 Apr 2023 19:55:21 -1000 Subject: [PATCH 07/25] fix --- src/diffusers/image_processor.py | 14 ++------- .../pipeline_stable_diffusion.py | 12 ++++---- .../pipeline_stable_unclip.py | 9 +++++- .../stable_diffusion/test_stable_diffusion.py | 2 +- tests/test_pipelines_common.py | 29 ++++++++++--------- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index aeace4c07bf7..a8bc697557b9 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -21,7 +21,7 @@ from PIL import Image from .configuration_utils import ConfigMixin, register_to_config -from .utils import deprecate, CONFIG_NAME, PIL_INTERPOLATION +from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate class VaeImageProcessor(ConfigMixin): @@ -87,14 +87,6 @@ def pt_to_numpy(images): images = images.cpu().permute(0, 2, 3, 1).float().numpy() return images - def pt_to_pil(images): - """ - Convert a pytorch tensor to a pil image - """ - images = self.pt_to_numpy(images) - images = self.numpy_to_pil(images) - return images - @staticmethod def normalize(images): """ @@ -179,10 +171,10 @@ def postprocess( ) deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) output_type = "np" - + if output_type == "latent": return image - + if image.min() < 0: image = (image / 2 + 0.5).clamp(0, 1) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 87098f51c00a..14f3a206483b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -13,6 +13,7 @@ # limitations under the License. 
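Read together, the two tests pin down a single contract: whichever of `pt`, `np`, or `pil` flows in or out, the pixels must agree. A rough usage sketch of the new helper inside a child test (illustrative values, not taken verbatim from the patch):

    inputs = self.get_dummy_inputs_by_type(torch_device, input_image_type="pt", output_type="np")
    # identical to get_dummy_inputs(), except every key listed in image_params has been
    # converted to the requested image type and "output_type": "np" has been injected
    output = pipe(**inputs)[0]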
From fe8e13ec64f98f2185ca40e2b5f34585ac64403a Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Sat, 8 Apr 2023 19:55:21 -1000
Subject: [PATCH 07/25] fix

---
 src/diffusers/image_processor.py              | 14 ++-------
 .../pipeline_stable_diffusion.py              | 12 ++++----
 .../pipeline_stable_unclip.py                 |  9 +++++-
 .../stable_diffusion/test_stable_diffusion.py |  2 +-
 tests/test_pipelines_common.py                | 29 ++++++++++---------
 5 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index aeace4c07bf7..a8bc697557b9 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -21,7 +21,7 @@
 from PIL import Image
 
 from .configuration_utils import ConfigMixin, register_to_config
-from .utils import deprecate, CONFIG_NAME, PIL_INTERPOLATION
+from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate
 
 
 class VaeImageProcessor(ConfigMixin):
@@ -87,14 +87,6 @@ def pt_to_numpy(images):
         images = images.cpu().permute(0, 2, 3, 1).float().numpy()
         return images
 
-    def pt_to_pil(images):
-        """
-        Convert a pytorch tensor to a pil image
-        """
-        images = self.pt_to_numpy(images)
-        images = self.numpy_to_pil(images)
-        return images
-
     @staticmethod
     def normalize(images):
         """
@@ -179,10 +171,10 @@ def postprocess(
             )
             deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
             output_type = "np"
-
+
         if output_type == "latent":
             return image
-
+
         if image.min() < 0:
             image = (image / 2 + 0.5).clamp(0, 1)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 87098f51c00a..14f3a206483b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import inspect
+import warnings
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
@@ -422,11 +423,11 @@ def _encode_prompt(
 
         return prompt_embeds
 
-    def run_safety_checker(self, image, device, dtype, output_type='pil'):
+    def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
-            has_nsfw_concept = None
+            has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type='pt')
+            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
@@ -443,6 +444,8 @@ def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     def prepare_extra_step_kwargs(self, generator, eta):
@@ -700,9 +703,8 @@ def __call__(
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
-
-        if not output_type == "latent":
+        if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
 
         image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index 46c2a0b7919e..d5c41e34a07d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -24,7 +24,14 @@
 from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
 from ...models.embeddings import get_timestep_embedding
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring
+from ...utils import (
+    deprecate,
+    is_accelerate_available,
+    is_accelerate_version,
+    logging,
+    randn_tensor,
+    replace_example_docstring,
+)
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index c28db6671860..0dbb466d115d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -41,7 +41,7 @@
 
 from ...models.test_models_unet_2d_condition import create_lora_layers
 from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index dbe51bc70c17..7e96882fb9eb 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -27,13 +27,14 @@ def to_np(tensor):
     return tensor
 
+
 class PipelineLatentTesterMixin:
-    """
+    """
     This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
     It provides a set of common tests for PyTorch pipeline that has vae, e.g.
     equivalence of different input and output types, etc.
     """
-
+
     @property
     def image_params(self) -> frozenset:
         raise NotImplementedError(
             "You need to set the attribute `image_params` in the child test class. "
             "`image_params` are tested for if all accepted input image types (i.e. `pt`,`pil`,`np`) are producing same results"
         )
 
-    def get_dummy_inputs_by_type(self, device, seed=0, input_image_type='pt', output_type='np'):
-
+    def get_dummy_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"):
         inputs = self.get_dummy_inputs(device, seed)
 
         def convert_pt_to_type(image, input_image_type):
             if input_image_type == "pt":
                 input_image = image
             elif input_image_type == "np":
                 input_image = VaeImageProcessor.pt_to_numpy(image)
             elif input_image_type == "pil":
-                input_image = VaeImageProcessor.pt_to_pil(image)
+                input_image = VaeImageProcessor.pt_to_numpy(image)
+                input_image = VaeImageProcessor.numpy_to_pil(input_image)
             else:
                 raise ValueError(f"unsupported input_image_type {input_image_type}.")
             return input_image
 
@@ -59,8 +60,8 @@ def convert_pt_to_type(image, input_image_type):
         for image_param in self.image_params:
             if image_param in inputs.keys():
                 inputs[image_param] = convert_pt_to_type(inputs[image_param], input_image_type)
-
-        inputs['output_type'] = output_type
+
+        inputs["output_type"] = output_type
 
         return inputs
 
@@ -74,15 +75,15 @@ def test_pt_np_pil_outputs_equivalent(self):
         output_np = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="np"))[0]
         output_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pil"))[0]
 
-        max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
+        max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
         self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generate different results from `output_type=='np'`")
-
+
         max_diff = np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max()
         self.assertLess(max_diff, 1e-4, "`output_type=='pil'` generate different results from `output_type=='np'`")
-
+
     def test_pt_np_pil_inputs_equivalent(self):
         if len(self.image_params) == 0:
-            return
+            return
 
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
@@ -90,12 +91,12 @@ def test_pt_np_pil_inputs_equivalent(self):
         pipe.set_progress_bar_config(disable=None)
 
         out_input_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0]
-        out_input_np = pipe(**self.get_dummy_inputs(torch_device, input_image_type="np"))[0]
-        out_input_pil = pipe(**self.get_dummy_inputs(torch_device, input_image_type="pil"))[0]
+        out_input_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
+        out_input_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pil"))[0]
 
         max_diff = np.abs(out_input_pt - out_input_np).max()
         self.assertLess(max_diff, 1e-4, "`input_type=='pt'` generate different result from `input_type=='np'`")
-        max_diff = np.abs(out_inpput_pil - out_input_np).max()
+        max_diff = np.abs(out_input_pil - out_input_np).max()
         self.assertLess(max_diff, 1e-2, "`input_type=='pt'` generate different result from `input_type=='np'`")
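With `pt_to_pil` dropped, the two surviving static helpers compose to the same conversion; a minimal sketch of the chain the mixin now uses (the shapes are assumed examples):

    import torch

    from diffusers.image_processor import VaeImageProcessor

    image_pt = torch.rand(1, 3, 64, 64)                   # NCHW tensor in [0, 1]
    image_np = VaeImageProcessor.pt_to_numpy(image_pt)    # (1, 64, 64, 3) float32 array
    image_pil = VaeImageProcessor.numpy_to_pil(image_np)  # list containing one PIL.Image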
From 4b09a20f62b32febe85ef20696f21eef1b199cae Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Sat, 8 Apr 2023 20:09:56 -1000
Subject: [PATCH 08/25] alt

---
 .../alt_diffusion/pipeline_alt_diffusion.py   | 49 +++++++++----------
 tests/pipeline_params.py                      |  2 +
 .../altdiffusion/test_alt_diffusion.py        |  7 +--
 .../stable_diffusion/test_stable_diffusion.py |  4 +-
 4 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index ccb8cd6bc34a..cda6f1808ff8 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import inspect
+import warnings
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
@@ -419,18 +420,31 @@ def _encode_prompt(
 
         return prompt_embeds
 
-    def run_safety_checker(self, image, device, dtype):
-        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-        safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-        image, has_nsfw_concept = self.safety_checker(
-            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-        )
+    def run_safety_checker(self, image, device, dtype, output_type="pil"):
+        if self.safety_checker is None or output_type == "latent":
+            has_nsfw_concept = False
+        else:
+            image = self.image_processor.postprocess(image, output_type="pt")
+            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
         return image, has_nsfw_concept
 
     def decode_latents(self, latents):
+        warnings.warn(
+            (
+                "The decode_latents method is deprecated and will be removed in a future version. Please"
+                " use VaeImageProcessor instead"
+            ),
+            FutureWarning,
+        )
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     def prepare_extra_step_kwargs(self, generator, eta):
@@ -689,27 +703,12 @@ def __call__(
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
 
-        if output_type not in ["latent", "pt", "np", "pil"]:
-            deprecation_message = (
-                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
-                "`pil`, `np`, `pt`, `latent`"
-            )
-            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
-            output_type = "np"
-
-        if output_type == "latent":
-            image = latents
-            has_nsfw_concept = None
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
-        else:
-            image = self.decode_latents(latents)
-
-            if self.safety_checker is not None:
-                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-            else:
-                has_nsfw_concept = False
+        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type)
-            image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type)
 
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/tests/pipeline_params.py b/tests/pipeline_params.py
index a0ac6c641c0b..fd9be04d0525 100644
--- a/tests/pipeline_params.py
+++ b/tests/pipeline_params.py
@@ -22,6 +22,8 @@
 
 TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
 
+TEXT_TO_IMAGE_IMAGE_PARAMS = frozenset([])
+
 IMAGE_VARIATION_PARAMS = frozenset(
     [
         "image",
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
index faa56e18f748..08949e940b53 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -28,17 +28,18 @@
 from diffusers.utils import slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 
-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = AltDiffusionPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
 
     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 0dbb466d115d..4f24c6b96135 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -40,7 +40,7 @@
 from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
 
 from ...models.test_models_unet_2d_condition import create_lora_layers
-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
 from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
 
@@ -51,7 +51,7 @@ class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTester
     pipeline_class = StableDiffusionPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = frozenset([])
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
 
     def get_dummy_components(self):
         torch.manual_seed(0)
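From the caller's side, the reworked `__call__` tail is meant to reduce to one contract across output types; a rough usage sketch, assuming an already-loaded pipeline `pipe`:

    latents = pipe("a photo", output_type="latent").images  # raw latents, decoding and safety check skipped
    arrays = pipe("a photo", output_type="np").images       # decoded and postprocessed float arrays
    pils = pipe("a photo", output_type="pil").images        # decoded and postprocessed PIL images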
From ce19bc9ae975b4008c32a20254b4233aecef7da1 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Sun, 9 Apr 2023 12:25:27 -1000
Subject: [PATCH 09/25] refactor all pipelines

---
 .../pipeline_paint_by_example.py              | 39 +++++++------
 .../pipeline_semantic_stable_diffusion.py     | 47 +++++++--------
 .../pipeline_cycle_diffusion.py               | 39 +++++++------
 ...line_stable_diffusion_attend_and_excite.py | 47 +++++++--------
 .../pipeline_stable_diffusion_controlnet.py   | 47 +++++++--------
 .../pipeline_stable_diffusion_depth2img.py    | 40 ++++++-------
 ...peline_stable_diffusion_image_variation.py | 48 +++++++---------
 .../pipeline_stable_diffusion_img2img.py      | 47 +++++++--------
 .../pipeline_stable_diffusion_inpaint.py      | 47 +++++++--------
 ...ipeline_stable_diffusion_inpaint_legacy.py | 47 +++++++--------
 ...eline_stable_diffusion_instruct_pix2pix.py | 47 +++++++--------
 .../pipeline_stable_diffusion_k_diffusion.py  | 48 +++++++---------
 ...ipeline_stable_diffusion_latent_upscale.py | 23 ++++----
 ...pipeline_stable_diffusion_model_editing.py | 38 ++++++-------
 .../pipeline_stable_diffusion_panorama.py     | 47 +++++++--------
 .../pipeline_stable_diffusion_pix2pix_zero.py | 57 +++++++++----------
 .../pipeline_stable_diffusion_sag.py          | 47 +++++++--------
 .../pipeline_stable_diffusion_upscale.py      | 25 ++++----
 .../pipeline_stable_unclip.py                 | 24 ++++----
 .../pipeline_stable_unclip_img2img.py         | 23 ++++----
 ...ipeline_versatile_diffusion_dual_guided.py | 24 ++++----
 ...ine_versatile_diffusion_image_variation.py | 25 ++++----
 ...eline_versatile_diffusion_text_to_image.py | 24 ++++----
 tests/pipeline_params.py                      |  2 +
 .../paint_by_example/test_paint_by_example.py |  5 +-
 .../stable_diffusion/test_cycle_diffusion.py  |  5 +-
 .../test_stable_diffusion_controlnet.py       |  5 +-
 .../test_stable_diffusion_image_variation.py  |  6 +-
 .../test_stable_diffusion_img2img.py          | 57 +++---------------
 ...st_stable_diffusion_instruction_pix2pix.py |  5 +-
 .../test_stable_diffusion_model_editing.py    |  7 ++-
 .../test_stable_diffusion_panorama.py         |  7 ++-
 .../test_stable_diffusion_pix2pix_zero.py     |  5 +-
 .../test_stable_diffusion_sag.py              |  7 ++-
 ...test_stable_diffusion_attend_and_excite.py |  7 ++-
 .../test_stable_diffusion_depth.py            |  5 +-
 .../test_stable_diffusion_inpaint.py          |  5 +-
 .../test_stable_diffusion_latent_upscale.py   |  5 +-
 .../stable_unclip/test_stable_unclip.py       |  7 ++-
 .../test_stable_unclip_img2img.py             |  6 +-
 40 files changed, 490 insertions(+), 556 deletions(-)
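Every pipeline touched below repeats the same two method bodies under a `# Copied from ...` marker, so the canonical edit lives in `pipeline_stable_diffusion.py` and the rest is mechanical (in this repository such copies are normally regenerated with `make fix-copies` rather than hand-edited). For orientation, the tail each `__call__` converges on looks roughly like this (variable names follow `pipeline_stable_diffusion.py`; a sketch, not a verbatim quote):

    if not output_type == "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor).sample

    image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type)

    image = self.image_processor.postprocess(image, output_type=output_type)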
diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
index 1cd73e376519..04dfb83c97bd 100644
--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import inspect
+import warnings
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -227,12 +228,16 @@ def _execution_device(self):
         return self.device
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype):
-        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-        safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-        image, has_nsfw_concept = self.safety_checker(
-            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-        )
+    def run_safety_checker(self, image, device, dtype, output_type='pil'):
+        if self.safety_checker is None or output_type == "latent":
+            has_nsfw_concept = False
+        else:
+            image = self.image_processor.postprocess(image, output_type="pt")
+            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
         return image, has_nsfw_concept
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -255,9 +260,16 @@ def prepare_extra_step_kwargs(self, generator, eta):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
     def decode_latents(self, latents):
+        warnings.warn(
+            "The decode_latents method is deprecated and will be removed in a future version. Please"
+            " use VaeImageProcessor instead",
+            FutureWarning,
+        )
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
@@ -566,19 +578,12 @@ def __call__(
             deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
             output_type = "np"
 
-        if output_type == "latent":
-            image = latents
-            has_nsfw_concept = None
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
-        else:
-            image = self.decode_latents(latents)
-
-            if self.safety_checker is not None:
-                image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
-            else:
-                has_nsfw_concept = False
+        image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype, output_type=output_type)
-            image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image, has_nsfw_concept)
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index 8eed72be4d24..aa11de425d19 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -1,4 +1,5 @@
 import inspect
+import warnings
 from itertools import repeat
 from typing import Callable, List, Optional, Union
 
@@ -134,19 +135,30 @@ def __init__(
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype):
-        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-        safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-        image, has_nsfw_concept = self.safety_checker(
-            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-        )
+    def run_safety_checker(self, image, device, dtype, output_type='pil'):
+        if self.safety_checker is None or output_type == "latent":
+            has_nsfw_concept = False
+        else:
+            image = self.image_processor.postprocess(image, output_type="pt")
+            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
         return image, has_nsfw_concept
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
     def decode_latents(self, latents):
+        warnings.warn(
+            "The decode_latents method is deprecated and will be removed in a future version. Please"
+            " use VaeImageProcessor instead",
+            FutureWarning,
+        )
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -689,27 +701,12 @@ def __call__(
                 if callback is not None and i % callback_steps == 0:
                     callback(i, t, latents)
 
-        if output_type not in ["latent", "pt", "np", "pil"]:
-            deprecation_message = (
-                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
-                "`pil`, `np`, `pt`, `latent`"
-            )
-            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
-            output_type = "np"
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
-        if output_type == "latent":
-            image = latents
-            has_nsfw_concept = None
-
-        else:
-            image = self.decode_latents(latents)
-
-            if self.safety_checker is not None:
-                image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
-            else:
-                has_nsfw_concept = False
+        image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype, output_type=output_type)
-        image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image, has_nsfw_concept)
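One behavioral detail these rewrites share: when the checker is disabled or skipped, `has_nsfw_concept` now comes back as `False` rather than `None`. A small consumer-side sketch, assuming a loaded pipeline `pipe`:

    out = pipe("an astronaut riding a horse", output_type="np")
    flagged = out.nsfw_content_detected
    # False when the checker is off or skipped, a list of bools otherwise
    if flagged and any(flagged):
        print("safety checker flagged at least one image")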
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps @@ -776,19 +788,12 @@ def __call__( deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) output_type = "np" - if output_type == "latent": - image = latents - has_nsfw_concept = None + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index d41b26ed1b99..1a9529cbc67d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings import math from typing import Any, Callable, Dict, List, Optional, Union @@ -450,19 +451,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -974,27 +986,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index a63c13bb4bdb..2646865ab9db 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -14,6 +14,7 @@ import inspect +import warnings import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -463,19 +464,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future 
version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -973,27 +985,12 @@ def __call__( self.controlnet.to("cpu") torch.cuda.empty_cache() - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 532e6e800a7a..eff62d50fae6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -14,6 +14,7 @@ import contextlib import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -308,19 +309,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is 
deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -684,20 +696,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - - else: - image = self.decode_latents(latents) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 486338ef9be3..8bce052452a7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import PIL @@ -184,19 +185,30 @@ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free return image_embeddings # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. 
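Depth2img is the outlier in this batch: it ships without a safety checker, so its `__call__` tail drops `run_safety_checker` entirely and reduces to decode-then-postprocess; schematically:

    if not output_type == "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor).sample

    image = self.image_processor.postprocess(image, output_type=output_type)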
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -396,28 +408,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype) - else: - has_nsfw_concept = False + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - image = self.image_processor.postprocess(image, output_type=output_type) + image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 64fdec838703..e6e19e603234 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 64fdec838703..e6e19e603234 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import inspect
+import warnings
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
@@ -434,19 +435,30 @@ def _encode_prompt(
         return prompt_embeds
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype):
-        feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-        safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-        image, has_nsfw_concept = self.safety_checker(
-            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-        )
+    def run_safety_checker(self, image, device, dtype, output_type='pil'):
+        if self.safety_checker is None or output_type == "latent":
+            has_nsfw_concept = False
+        else:
+            image = self.image_processor.postprocess(image, output_type="pt")
+            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
         return image, has_nsfw_concept
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
     def decode_latents(self, latents):
+        warnings.warn(
+            "The decode_latents method is deprecated and will be removed in a future version. Please"
+            " use VaeImageProcessor instead",
+            FutureWarning,
+        )
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
@@ -731,27 +743,12 @@ def __call__(
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
 
-        if output_type not in ["latent", "pt", "np", "pil"]:
-            deprecation_message = (
-                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
-                "`pil`, `np`, `pt`, `latent`"
-            )
-            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
-            output_type = "np"
-
-        if output_type == "latent":
-            image = latents
-            has_nsfw_concept = None
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
-        else:
-            image = self.decode_latents(latents)
-
-            if self.safety_checker is not None:
-                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-            else:
-                has_nsfw_concept = False
+        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type)
-        image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type)
 
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
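Because the mixin exercises `pt`, `np`, and `pil` inputs alike, the img2img entry point is expected to accept any of the three for `image`; a usage sketch with an assumed local file:

    from PIL import Image

    init_image = Image.open("sketch.png").convert("RGB")  # assumed example input
    result = pipe(prompt="a watercolor landscape", image=init_image, output_type="np")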
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs @@ -885,27 +897,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index d113b4917e46..09bb5caa118c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -425,19 +426,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -706,27 +718,12 @@ def __call__( # use original latents corresponding to unmasked portions of the image latents = (init_latents_orig * mask) + (latents * (1 - mask)) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index b897f55a7b6d..69478009c949 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -379,27 +380,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: @@ -632,12 +618,16 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + 
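The deprecation here is a plain `warnings.warn(..., FutureWarning)` rather than the library's `deprecate(...)` helper, so a test can assert on it with the standard library alone; a minimal sketch, reusing `pipe` and `latents` from the sketches above:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        pipe.decode_latents(latents)  # triggers the warning added in this patch
    assert any(issubclass(w.category, FutureWarning) for w in caught)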
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -660,9 +650,16 @@ def prepare_extra_step_kwargs(self, generator, eta): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index dc878ad084f5..35d4a0c23cd0 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -13,6 +13,7 @@ # limitations under the License. import importlib +import warnings from typing import Callable, List, Optional, Union import torch @@ -347,19 +348,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs(self, prompt, height, width, callback_steps): @@ -547,28 +559,12 @@ def model_fn(x, t): # 8. Run k-diffusion solver latents = self.sampler(model_fn, latents, sigmas) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - image = self.image_processor.postprocess(image, output_type=output_type) + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 9307515c3ccc..6fd0febe61dc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -14,6 +14,7 @@ from typing import Callable, List, Optional, Union +import warnings import numpy as np import PIL import torch @@ -223,9 +224,16 @@ def _encode_prompt(self, prompt, device, do_classifier_free_guidance, negative_p # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs(self, prompt, image, callback_steps): @@ -506,19 +514,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - else: - image = self.decode_latents(latents) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index d841bd8a2d26..bc463b9b2190 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -13,6 +13,7 @@ import copy import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import torch @@ -372,18 +373,25 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) image, has_nsfw_concept = self.safety_checker( images=image, clip_input=safety_checker_input.pixel_values.to(dtype) ) - else: - has_nsfw_concept = None return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) @@ -767,24 +775,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index de316e723031..2bae1fc45154 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -12,6 +12,7 @@ # limitations under the License. import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import torch @@ -345,19 +346,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -662,27 +674,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 738e86f8b17d..48cf55f0e926 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union @@ -579,19 +580,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -1040,28 +1052,12 @@ def __call__( if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False - - image = self.image_processor.postprocess(image, output_type=output_type) + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() @@ -1258,16 +1254,15 @@ def invert( inverted_latents = latents.detach().clone() # 8. Post-processing - image = self.decode_latents(latents.detach()) + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample + + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() - # 9. Convert to PIL. - if output_type == "pil": - image = self.image_processor.postprocess(image, output_type="pil") - if not return_dict: return (inverted_latents, image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 4b1fa2a9afe9..b0e18554269e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import torch @@ -362,19 +363,30 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type='pil'): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -687,27 +699,12 @@ def get_map_size(module, input, output): if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - has_nsfw_concept = None - - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image, has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 430166768454..ff433dedd553 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -322,9 +323,16 @@ def prepare_extra_step_kwargs(self, generator, eta): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def check_inputs( @@ -644,21 +652,12 @@ def __call__( callback(i, t, latents) # 10. Post-processing - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - else: + if not output_type == "latent": # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) - image = self.decode_latents(latents.float()) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample + + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index d5c41e34a07d..b8548f3261bc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch @@ -483,9 +484,16 @@ def _encode_prompt( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs with prepare_extra_step_kwargs->prepare_prior_extra_step_kwargs, scheduler->prior_scheduler @@ -923,20 +931,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index adfccbac4b2d..290e11d80277 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import PIL @@ -431,9 +432,16 @@ def _encode_image( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -813,19 +821,10 @@ def __call__( callback(i, t, latents) # 9. Post-processing - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - if output_type == "latent": - image = latents - else: - image = self.decode_latents(latents) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index f4c1d131bf19..b3237b95eccb 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Tuple, Union import numpy as np @@ -331,9 +332,16 @@ def normalize_embeddings(encoder_output): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -572,20 +580,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 4a39d0f7ad4d..f3d770bd7c3d 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -191,9 +192,16 @@ def normalize_embeddings(encoder_output): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -414,21 +422,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - - else: - image = self.decode_latents(latents) - image = self.image_processor.postprocess(image, output_type=output_type) + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample + image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: return (image,) diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index e9c03dffa846..c05e364f1976 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import torch @@ -248,9 +249,16 @@ def normalize_embeddings(encoder_output): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): + warnings.warn( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead", + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -488,20 +496,10 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
-                "`pil`, `np`, `pt`, `latent`"
-            )
-            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
-            output_type = "np"
-
-        if output_type == "latent":
-            image = latents
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor).sample

-
-        else:
-            image = self.decode_latents(latents)
-            image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type)

         if not return_dict:
             return (image,)
diff --git a/tests/pipeline_params.py b/tests/pipeline_params.py
index fd9be04d0525..e2b68849e254 100644
--- a/tests/pipeline_params.py
+++ b/tests/pipeline_params.py
@@ -24,6 +24,8 @@

 TEXT_TO_IMAGE_IMAGE_PARAMS = frozenset([])

+IMAGE_TO_IMAGE_IMAGE_PARAMS = frozenset(["image"])
+
 IMAGE_VARIATION_PARAMS = frozenset(
     [
         "image",
diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py
index 81d1989200ac..a0c569802c16 100644
--- a/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -28,16 +28,17 @@
 from diffusers.utils.testing_utils import require_torch_gpu

 from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class PaintByExamplePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = PaintByExamplePipeline
     params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
     batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index 5282cfd8dd24..638cb237b209 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -26,13 +26,13 @@
 from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

 from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = CycleDiffusionPipeline
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
         "negative_prompt",
         "height",
         "width",
         "negative_prompt_embeds",
     }
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"})
+    image_params = frozenset([])  # TO-DO: add image_params once pipeline is refactored with VaeImageProcessor.preprocess

     def get_dummy_components(self):
         torch.manual_seed(0)
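Each fast-test class in this patch gains PipelineLatentTesterMixin and an image_params attribute; pipelines whose input preprocessing has not yet moved to VaeImageProcessor opt out with an empty frozenset. The mixin itself lives in tests/test_pipelines_common.py and is not shown here; judging by the pt/np/pil equivalence tests deleted from the img2img suite further down, it presumably centralizes a check along these lines (function name and tolerances are illustrative assumptions):

import numpy as np

def assert_pt_np_pil_outputs_equivalent(pipe, get_inputs):
    # the same seed must yield numerically matching images in all three output formats
    output_pt = pipe(**get_inputs(output_type="pt")).images
    output_np = pipe(**get_inputs(output_type="np")).images
    output_pil = pipe(**get_inputs(output_type="pil")).images
    assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4
    assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4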
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
index d556e6318f43..5b6dccfb61d8 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
@@ -34,13 +34,14 @@
 from diffusers.utils.testing_utils import require_torch_gpu

 from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


-class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionControlNetPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: add image_params once pipeline is refactored with VaeImageProcessor.preprocess

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index 01c2e22e4816..3bca92e76705 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -33,16 +33,18 @@
 from diffusers.utils.testing_utils import require_torch_gpu

 from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionImageVariationPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionImageVariationPipeline
     params = IMAGE_VARIATION_PARAMS
     batch_params = IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+
     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index e27f83fc04fe..8f7658082896 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -34,18 +34,19 @@
 from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

-from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, IMAGE_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionImg2ImgPipeline
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS

     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -95,33 +96,19 @@ def get_dummy_components(self):
         }
         return components

-    def get_dummy_inputs(self, device, seed=0, input_image_type="pt", output_type="np"):
+    def get_dummy_inputs(self, device, seed=0):
         image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
-
-        if input_image_type == "pt":
-            input_image = image
-        elif input_image_type == "np":
-            input_image = image.cpu().numpy().transpose(0, 2, 3, 1)
-        elif input_image_type == "pil":
-            input_image = image.cpu().numpy().transpose(0, 2, 3, 1)
-            input_image = VaeImageProcessor.numpy_to_pil(input_image)
-        else:
-            raise ValueError(f"unsupported input_image_type {input_image_type}.")
-
-        if output_type not in ["pt", "np", "pil"]:
-            raise ValueError(f"unsupported output_type {output_type}")
-
         inputs = {
             "prompt": "A painting of a squirrel eating a burger",
-            "image": input_image,
+            "image": image,
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 6.0,
-            "output_type": output_type,
+            "output_type": "numpy",
         }
         return inputs

@@ -216,36 +203,6 @@ def test_save_load_optional_components(self):
     def test_attention_slicing_forward_pass(self):
         return super().test_attention_slicing_forward_pass()

-    @skip_mps
-    def test_pt_np_pil_outputs_equivalent(self):
-        device = "cpu"
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type="pt"))[0]
-        output_np = sd_pipe(**self.get_dummy_inputs(device, output_type="np"))[0]
-        output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type="pil"))[0]
-
-        assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4
-        assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4
-
-    @skip_mps
-    def test_image_types_consistent(self):
-        device = "cpu"
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pt"))[0]
-        output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type="np"))[0]
-        output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pil"))[0]
-
-        assert np.abs(output_pt - output_np).max() <= 1e-4
-        assert np.abs(output_pil - output_np).max() <= 1e-2
-

 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index 25b0c6ea1432..5d0f9213dfb0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -35,16 +35,17 @@
 from diffusers.utils.testing_utils import require_torch_gpu

 from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionInstructPix2PixPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionInstructPix2PixPipeline
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"}
     batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
index 2d9b1e54ee6e..d30032e367a1 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
@@ -31,18 +31,19 @@
 from diffusers.utils import slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


 @skip_mps
-class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionModelEditingPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionModelEditingPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index af26e19cca73..74493402bcfb 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -32,18 +32,19 @@
 from diffusers.utils import slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


 @skip_mps
-class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionPanoramaPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionPanoramaPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
index 46b93a0589ce..adecde4a0356 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
@@ -34,17 +34,18 @@
 from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps

 from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


 @skip_mps
-class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionPix2PixZeroPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionPix2PixZeroPipeline
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess

     @classmethod
     def setUpClass(cls):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
index abaefbcad011..ee506ea95e48 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
@@ -29,17 +29,18 @@
 from diffusers.utils import slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu

-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionSAGPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
     test_cpu_offload = False

     def get_dummy_components(self):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index 780abf304a46..35c6ce9c787b 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -29,16 +29,17 @@
 from diffusers.utils import load_numpy, skip_mps, slow
 from diffusers.utils.testing_utils import require_torch_gpu

-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 @skip_mps
-class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionAttendAndExcitePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionAttendAndExcitePipeline
     test_attention_slicing = False
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index c2ad239f6888..4abc93d5e97c 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -52,19 +52,20 @@
 from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

 from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


 @skip_mps
-class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionDepth2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionDepth2ImgPipeline
     test_save_load_optional_components = False
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index ee059314904f..bab8d143dacb 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -27,16 +27,17 @@
 from diffusers.utils.testing_utils import require_torch_gpu, slow

 from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionInpaintPipeline
     params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
     batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess

     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 38f4b053714b..fcc5677139e4 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -32,13 +32,13 @@
 from diffusers.utils.testing_utils import require_torch_gpu

 from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin


 torch.backends.cuda.matmul.allow_tf32 = False


-class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableDiffusionLatentUpscalePipeline
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {
         "height",
@@ -49,6 +49,7 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittes
     }
     required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"}
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
     test_cpu_offload = True

     @property
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py
index 368ab21f24a9..c46e8c2976ca 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip.py
@@ -15,14 +15,15 @@
 from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device

-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin, assert_mean_pixel_difference


-class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableUnCLIPPipeline
     params = TEXT_TO_IMAGE_PARAMS
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS

     # TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false
     test_xformers_attention = False
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
index f93fa3a59014..734888e6d4c6 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -21,15 +21,19 @@

 from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
 from ...test_pipelines_common import (
+    PipelineLatentTesterMixin,
     PipelineTesterMixin,
     assert_mean_pixel_difference,
 )


-class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
     pipeline_class = StableUnCLIPImg2ImgPipeline
     params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = frozenset([])  # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
+
+
     def get_dummy_components(self):
         embedder_hidden_size = 32
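Across the preceding hunks, decode_latents is reduced to a deprecated shim: it still decodes to a float32 NumPy array, but the pipelines themselves now go through VaeImageProcessor.postprocess. Roughly, for a pipeline object `pipe` and latents obtained with output_type="latent", the old and new paths compare as follows (a sketch for illustration only, not code from this patch):

# old, deprecated path: NumPy array in [0, 1], then PIL
image_np = pipe.decode_latents(latents)
image_pil = pipe.numpy_to_pil(image_np)

# new path: decode manually, let the image processor denormalize and convert
decoded = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample
image_pil = pipe.image_processor.postprocess(decoded, output_type="pil")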
b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 14e6696db95a..2fac545a9ae1 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -425,18 +426,31 @@ def _encode_prompt( return prompt_embeds - def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + def run_safety_checker(self, image, device, dtype, output_type="pil"): + if self.safety_checker is None or output_type == "latent": + has_nsfw_concept = False + else: + image = self.image_processor.postprocess(image, output_type="pt") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept def decode_latents(self, latents): + warnings.warn( + ( + "The decode_latents method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor instead" + ), + FutureWarning, + ) latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def prepare_extra_step_kwargs(self, generator, eta): @@ -720,27 +734,12 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type not in ["latent", "pt", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - image = latents - has_nsfw_concept = None + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - else: - image = self.decode_latents(latents) - - if self.safety_checker is not None: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - has_nsfw_concept = False + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type) - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: From 2c76ca3eec24f749d308d50c5e4d0a4e2f3453fd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 9 Apr 2023 12:31:05 -1000 Subject: [PATCH 11/25] make style --- .../paint_by_example/pipeline_paint_by_example.py | 6 ++++-- .../pipeline_semantic_stable_diffusion.py | 8 +++++--- .../pipeline_stable_diffusion_attend_and_excite.py | 3 +-- .../pipeline_stable_diffusion_controlnet.py | 5 ++--- .../pipeline_stable_diffusion_depth2img.py | 2 +- .../pipeline_stable_diffusion_image_variation.py | 6 ++++-- .../pipeline_stable_diffusion_img2img.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion_inpaint_legacy.py | 2 +- .../pipeline_stable_diffusion_instruct_pix2pix.py | 2 +- .../pipeline_stable_diffusion_k_diffusion.py | 4 ++-- .../pipeline_stable_diffusion_latent_upscale.py | 4 ++-- .../pipeline_stable_diffusion_model_editing.py | 2 +- .../pipeline_stable_diffusion_panorama.py | 3 +-- .../pipeline_stable_diffusion_pix2pix_zero.py | 3 +-- .../stable_diffusion/pipeline_stable_diffusion_sag.py | 3 +-- .../stable_diffusion/pipeline_stable_unclip.py | 1 - .../pipeline_stable_unclip_img2img.py | 2 +- .../pipeline_versatile_diffusion_dual_guided.py | 2 +- .../pipeline_versatile_diffusion_image_variation.py | 2 +- .../pipeline_versatile_diffusion_text_to_image.py | 2 +- tests/pipeline_params.py | 2 +- tests/pipelines/altdiffusion/test_alt_diffusion.py | 4 ++-- .../paint_by_example/test_paint_by_example.py | 4 ++-- .../stable_diffusion/test_cycle_diffusion.py | 4 ++-- .../stable_diffusion/test_stable_diffusion.py | 2 +- .../test_stable_diffusion_controlnet.py | 4 ++-- .../test_stable_diffusion_image_variation.py | 11 +++++++---- .../stable_diffusion/test_stable_diffusion_img2img.py | 10 +++++++--- .../test_stable_diffusion_instruction_pix2pix.py | 10 +++++++--- .../test_stable_diffusion_model_editing.py | 4 ++-- .../test_stable_diffusion_panorama.py | 4 ++-- .../test_stable_diffusion_pix2pix_zero.py | 6 ++++-- .../stable_diffusion/test_stable_diffusion_sag.py | 2 +- .../test_stable_diffusion_attend_and_excite.py | 8 +++++--- .../stable_diffusion_2/test_stable_diffusion_depth.py | 6 ++++-- .../test_stable_diffusion_inpaint.py | 6 ++++-- .../test_stable_diffusion_latent_upscale.py | 6 ++++-- tests/pipelines/stable_unclip/test_stable_unclip.py | 4 ++-- .../stable_unclip/test_stable_unclip_img2img.py | 6 +++--- 40 files changed, 95 insertions(+), 74 deletions(-) diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 
04dfb83c97bd..a910f4dc84db 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -228,7 +228,7 @@ def _execution_device(self): return self.device # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: @@ -581,7 +581,9 @@ def __call__( if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype, output_type=output_type) + image, has_nsfw_concept = self.run_safety_checker( + image, device, image_embeddings.dtype, output_type=output_type + ) image = self.image_processor.postprocess(image, output_type=output_type) diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index aa11de425d19..2a5677735633 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -11,7 +11,7 @@ from ...pipeline_utils import DiffusionPipeline from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor +from ...utils import logging, randn_tensor from . import SemanticStableDiffusionPipelineOutput @@ -135,7 +135,7 @@ def __init__( self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: @@ -704,7 +704,9 @@ def __call__( if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype, output_type=output_type) + image, has_nsfw_concept = self.run_safety_checker( + image, self.device, text_embeddings.dtype, output_type=output_type + ) image = self.image_processor.postprocess(image, output_type=output_type) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 1a9529cbc67d..22aa5a29efba 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -13,8 +13,8 @@ # limitations under the License. 
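
Every run_safety_checker hunk in this series converges on the same contract: skip checking when no checker is loaded or when latents are returned, otherwise postprocess to PIL for the CLIP feature extractor. As a reading aid, a minimal standalone sketch of that flow — illustrative only, not the library method; `pipe` stands in for any pipeline object carrying `safety_checker`, `feature_extractor`, and `image_processor` attributes:

def run_safety_checker_sketch(pipe, image, device, dtype, output_type="pil"):
    # Nothing to screen: either no checker is loaded or the caller asked
    # for raw latents, which are never handed to the checker.
    if pipe.safety_checker is None or output_type == "latent":
        return image, False
    # The CLIP-based checker consumes PIL images, so denormalize first.
    feature_extractor_input = pipe.image_processor.postprocess(image, output_type="pil")
    safety_checker_input = pipe.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
    image, has_nsfw_concept = pipe.safety_checker(
        images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
    )
    return image, has_nsfw_concept
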
import inspect -import warnings import math +import warnings from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -28,7 +28,6 @@ from ...models.attention_processor import Attention from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - deprecate, is_accelerate_available, is_accelerate_version, logging, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 2646865ab9db..f12fd2265703 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -14,8 +14,8 @@ import inspect -import warnings import os +import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -32,7 +32,6 @@ from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( PIL_INTERPOLATION, - deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -464,7 +463,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index eff62d50fae6..2ba9bac08cee 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -309,7 +309,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 8bce052452a7..f63b9f8ccad7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -185,7 +185,7 @@ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free return image_embeddings # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: @@ -411,7 +411,9 @@ def __call__( if not output_type == "latent": image = self.vae.decode(latents / self.vae.config.scaling_factor).sample - image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype, output_type=output_type) + image, has_nsfw_concept = self.run_safety_checker( + image, device, 
image_embeddings.dtype, output_type=output_type + ) image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e6e19e603234..e7c0abf68d13 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -435,7 +435,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 9dcf3786c036..f9734bbb0073 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -490,7 +490,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 09bb5caa118c..e303a0043a57 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -426,7 +426,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 69478009c949..9e0f8168b177 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -618,7 +618,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py 
b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index 35d4a0c23cd0..61bee53918c4 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -24,7 +24,7 @@ from ...loaders import TextualInversionLoaderMixin from ...pipelines import DiffusionPipeline from ...schedulers import LMSDiscreteScheduler -from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor +from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor from . import StableDiffusionPipelineOutput @@ -348,7 +348,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 6fd0febe61dc..0571a130dbc4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from typing import Callable, List, Optional, Union -import warnings import numpy as np import PIL import torch @@ -24,7 +24,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import EulerDiscreteScheduler -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index bc463b9b2190..d4178d8d31e9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -373,7 +373,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 2bae1fc45154..5770db283345 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -23,7 +23,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, PNDMScheduler from ...utils import ( - deprecate, 
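
These `deprecate` import removals follow from PATCH 10, where decode_latents became a deprecated shim. Reduced to a standalone function, the shim looks like the sketch below — written under the assumption that `vae` is an AutoencoderKL-like module exposing `config.scaling_factor`; it is a hypothetical free-function form of the pipeline method, not the method itself:

import warnings

def decode_latents_sketch(vae, latents):
    warnings.warn(
        "The decode_latents method is deprecated and will be removed in a future version."
        " Please use VaeImageProcessor instead",
        FutureWarning,
    )
    # Undo the scaling applied at encode time, decode with the VAE, then
    # map the [-1, 1] output into the [0, 1] image range.
    latents = 1 / vae.config.scaling_factor * latents
    image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    # cast to float32, which is cheap and compatible with bfloat16
    return image.cpu().permute(0, 2, 3, 1).float().numpy()
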
is_accelerate_available, is_accelerate_version, logging, @@ -346,7 +345,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 48cf55f0e926..2dbc1cc6656f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -38,7 +38,6 @@ from ...utils import ( PIL_INTERPOLATION, BaseOutput, - deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -580,7 +579,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index b0e18554269e..290b94aafd98 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -25,7 +25,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -363,7 +362,7 @@ def _encode_prompt( return prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, device, dtype, output_type='pil'): + def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index b8548f3261bc..c7fb3e46e844 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -26,7 +26,6 @@ from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - deprecate, is_accelerate_available, is_accelerate_version, logging, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 290e11d80277..f43de50e963e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -27,7 +27,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers -from ...utils 
import deprecate, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_version, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index b3237b95eccb..324b3680c2e7 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -30,7 +30,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index f3d770bd7c3d..eef628fb3955 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -25,7 +25,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index c05e364f1976..1b7dfe849c7d 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -23,7 +23,7 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/tests/pipeline_params.py b/tests/pipeline_params.py index e2b68849e254..7c5ffa2ca24b 100644 --- a/tests/pipeline_params.py +++ b/tests/pipeline_params.py @@ -22,7 +22,7 @@ TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) -TEXT_TO_IMAGE_IMAGE_PARAMS = frozenset([]) +TEXT_TO_IMAGE_IMAGE_PARAMS = frozenset([]) IMAGE_TO_IMAGE_IMAGE_PARAMS = frozenset(["image"]) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 08949e940b53..7669d034d0b4 100644 --- 
a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -28,8 +28,8 @@
 from diffusers.utils import slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 
-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin
+from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py
index a0c569802c16..ac9ba1c2b877 100644
--- a/tests/pipelines/paint_by_example/test_paint_by_example.py
+++ b/tests/pipelines/paint_by_example/test_paint_by_example.py
@@ -28,7 +28,7 @@
 from diffusers.utils.testing_utils import require_torch_gpu
 
 from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin
+from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -38,7 +38,7 @@ class PaintByExamplePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterM
     pipeline_class = PaintByExamplePipeline
     params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
     batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
-    image_params = frozenset([]) # TO_DO: update the image_params once refactored VaeImageProcessor.preprocess
+    image_params = frozenset([])  # TO_DO: update the image_params once refactored VaeImageProcessor.preprocess
 
     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index 638cb237b209..d9eb1a8d10dc 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -26,7 +26,7 @@
 from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
 
 from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin
+from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -42,7 +42,7 @@ class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterM
     }
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"})
-    image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
+    image_params = frozenset([])  # TO_DO: add image_params once refactored VaeImageProcessor.preprocess
 
     def get_dummy_components(self):
         torch.manual_seed(0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 4f24c6b96135..b0f1f2a24f4f 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -40,7 +40,7 @@
 from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
 
 from ...models.test_models_unet_2d_condition 
import create_lora_layers -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 5b6dccfb61d8..4190dcb371c9 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -34,14 +34,14 @@ from diffusers.utils.testing_utils import require_torch_gpu from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin class StableDiffusionControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 3bca92e76705..f0aa9db25000 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -33,18 +33,21 @@ from diffusers.utils.testing_utils import require_torch_gpu from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False -class StableDiffusionImageVariationPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionImageVariationPipelineFastTests( + PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionImageVariationPipeline params = IMAGE_VARIATION_PARAMS batch_params = IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess - + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 8f7658082896..755791c0cd6f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -34,8 +34,12 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, 
IMAGE_TO_IMAGE_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -46,7 +50,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipelin params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 5d0f9213dfb0..46ce9a1b69e5 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -35,17 +35,21 @@ from diffusers.utils.testing_utils import require_torch_gpu from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False -class StableDiffusionInstructPix2PixPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionInstructPix2PixPipelineFastTests( + PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionInstructPix2PixPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"} batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index d30032e367a1..e7f6bf7023ba 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -31,8 +31,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 74493402bcfb..eee02f019eb5 100644 --- 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -32,8 +32,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index adecde4a0356..6df9c92dc956 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -34,7 +34,7 @@ from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -45,7 +45,9 @@ class StableDiffusionPix2PixZeroPipelineFastTests(PipelineLatentTesterMixin, Pip pipeline_class = StableDiffusionPix2PixZeroPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess @classmethod def setUpClass(cls): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index ee506ea95e48..9fc7d8a84592 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -29,7 +29,7 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 35c6ce9c787b..6835e5db5d09 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -29,12 +29,14 @@ from diffusers.utils import load_numpy, skip_mps, slow from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, 
TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @skip_mps -class StableDiffusionAttendAndExcitePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionAttendAndExcitePipelineFastTests( + PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionAttendAndExcitePipeline test_attention_slicing = False params = TEXT_TO_IMAGE_PARAMS diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 4abc93d5e97c..f95814deeec9 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -52,7 +52,7 @@ from diffusers.utils.testing_utils import require_torch_gpu, skip_mps from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -65,7 +65,9 @@ class StableDiffusionDepth2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index bab8d143dacb..e373e48d8720 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -27,7 +27,7 @@ from diffusers.utils.testing_utils import require_torch_gpu, slow from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -37,7 +37,9 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, Pipeli pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index fcc5677139e4..6eddb8ba4519 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -32,7 +32,7 @@ 
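
All of these test-file hunks instantiate one template: PipelineLatentTesterMixin is listed before PipelineTesterMixin so its latent-output checks see the class attributes declared below it, and image_params stays an empty frozenset until the pipeline under test moves to VaeImageProcessor.preprocess. A hedged skeleton of that template — the test class and its pipeline are placeholders, and the relative imports resolve only inside the repository's tests/pipelines/ tree:

import unittest

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin


class ExamplePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
    pipeline_class = None  # placeholder: a real test points this at its pipeline class
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    # ported pipelines declare their image inputs here (e.g. IMAGE_TO_IMAGE_IMAGE_PARAMS);
    # unported ones keep frozenset([]) until VaeImageProcessor.preprocess lands
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
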
from diffusers.utils.testing_utils import require_torch_gpu from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -49,7 +49,9 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, P } required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess test_cpu_offload = True @property diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index c46e8c2976ca..0fe983d11a59 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -15,8 +15,8 @@ from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, PipelineLatentTesterMixin, assert_mean_pixel_difference +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ...test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index 734888e6d4c6..d7891dc908fb 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -31,9 +31,9 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTe pipeline_class = StableUnCLIPImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess - - + image_params = frozenset( + [] + ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess def get_dummy_components(self): embedder_hidden_size = 32 From 389fdfe5c19e45eb074f7187eea42e352d31d56f Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 9 Apr 2023 13:00:05 -1000 Subject: [PATCH 12/25] fix model edit pipeline --- .../stable_diffusion/pipeline_stable_diffusion_model_editing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index d4178d8d31e9..ec9ee60f61d6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -19,6 +19,7 @@ import torch from 
transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
+from ...image_processor import VaeImageProcessor
 from ...loaders import TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import PNDMScheduler
@@ -130,6 +131,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
         self.with_to_k = with_to_k

From db33f87e6d85be477e5e1b2d6243f84ab1ee64b2 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 10 Apr 2023 04:53:48 +0000
Subject: [PATCH 13/25] evaluation mode for depth estimator in testing

---
 .../pipelines/stable_diffusion_2/test_stable_diffusion_depth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index f95814deeec9..b9ffb8acc780 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -135,7 +135,7 @@ def get_dummy_components(self):
             backbone_config=backbone_config,
             backbone_featmap_shape=[1, 384, 24, 24],
         )
-        depth_estimator = DPTForDepthEstimation(depth_estimator_config)
+        depth_estimator = DPTForDepthEstimation(depth_estimator_config).eval()
         feature_extractor = DPTFeatureExtractor.from_pretrained(
             "hf-internal-testing/tiny-random-DPTForDepthEstimation"
         )

From 5cde78c20f544055b1d69f60971179c9a8576b08 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 08:18:25 -1000
Subject: [PATCH 14/25] fix image_processor postprocess and onnx upscale pipeline

---
 src/diffusers/image_processor.py | 3 +-
 .../pipeline_onnx_stable_diffusion_upscale.py | 32 +++++++++++++++++--
 .../pipeline_stable_diffusion.py | 1 -
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index a8bc697557b9..05bdde12f4d0 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -175,8 +175,7 @@ def postprocess(
         if output_type == "latent":
             return image
 
-        if image.min() < 0:
-            image = (image / 2 + 0.5).clamp(0, 1)
+        image = (image / 2 + 0.5).clamp(0, 1)
 
         if isinstance(image, torch.Tensor) and output_type == "pt":
             return image
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
index b91262551b0f..18c378a297d7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -7,7 +7,8 @@
 from ...schedulers import DDPMScheduler
 from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
-from ..pipeline_utils import ImagePipelineOutput
+from ...utils import deprecate
+from ..pipeline_utils import DiffusionPipeline
 from . 
import StableDiffusionUpscalePipeline @@ -45,7 +46,7 @@ def preprocess(image): return image -class OnnxStableDiffusionUpscalePipeline(StableDiffusionUpscalePipeline): +class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline): def __init__( self, vae: OnnxRuntimeModel, @@ -56,7 +57,32 @@ def __init__( scheduler: Any, max_noise_level: int = 350, ): - super().__init__(vae, text_encoder, tokenizer, unet, low_res_scheduler, scheduler, max_noise_level) + if hasattr(vae, "config"): + # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate + is_vae_scaling_factor_set_to_0_08333 = ( + hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 + ) + if not is_vae_scaling_factor_set_to_0_08333: + deprecation_message = ( + "The configuration file of the vae does not contain `scaling_factor` or it is set to" + f" {vae.config.scaling_factor}, which seems highly unlikely. If your checkpoint is a fine-tuned" + " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to" + " 0.08333 Please make sure to update the config accordingly, as not doing so might lead to" + " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging" + " Face Hub, it would be very nice if you could open a Pull Request for the `vae/config.json` file" + ) + deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False) + vae.register_to_config(scaling_factor=0.08333) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + low_res_scheduler=low_res_scheduler, + scheduler=scheduler, + ) + self.register_to_config(max_noise_level=max_noise_level) def __call__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 14f3a206483b..ea21482fb6e0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -427,7 +427,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"): if self.safety_checker is None or output_type == "latent": has_nsfw_concept = False else: - image = self.image_processor.postprocess(image, output_type="pt") feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) image, has_nsfw_concept = self.safety_checker( From 928c35b24a9a94532d9e904eba2bdf8d8b5cb9ca Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 17 Apr 2023 14:39:43 -1000 Subject: [PATCH 15/25] style + copy --- .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 - .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 - .../pipelines/paint_by_example/pipeline_paint_by_example.py | 1 - .../pipeline_semantic_stable_diffusion.py | 1 - .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 1 - .../pipeline_onnx_stable_diffusion_upscale.py | 6 +++--- .../pipeline_stable_diffusion_attend_and_excite.py | 1 - .../pipeline_stable_diffusion_controlnet.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 1 - .../pipeline_stable_diffusion_image_variation.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 1 - .../pipeline_stable_diffusion_inpaint_legacy.py | 1 - 
From 928c35b24a9a94532d9e904eba2bdf8d8b5cb9ca Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 14:39:43 -1000
Subject: [PATCH 15/25] style + copy

---
 .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 -
 .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 -
 .../pipelines/paint_by_example/pipeline_paint_by_example.py | 1 -
 .../pipeline_semantic_stable_diffusion.py | 1 -
 .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 1 -
 .../pipeline_onnx_stable_diffusion_upscale.py | 6 +++---
 .../pipeline_stable_diffusion_attend_and_excite.py | 1 -
 .../pipeline_stable_diffusion_controlnet.py | 1 -
 .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 1 -
 .../pipeline_stable_diffusion_image_variation.py | 1 -
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 1 -
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 1 -
 .../pipeline_stable_diffusion_inpaint_legacy.py | 1 -
 .../pipeline_stable_diffusion_instruct_pix2pix.py | 1 -
 .../pipeline_stable_diffusion_k_diffusion.py | 1 -
 .../pipeline_stable_diffusion_model_editing.py | 1 -
 .../stable_diffusion/pipeline_stable_diffusion_panorama.py | 1 -
 .../pipeline_stable_diffusion_pix2pix_zero.py | 1 -
 .../stable_diffusion/pipeline_stable_diffusion_sag.py | 1 -
 tests/pipelines/stable_diffusion/test_stable_diffusion.py | 1 +
 .../stable_diffusion/test_stable_diffusion_panorama.py | 1 +
 tests/pipelines/stable_unclip/test_stable_unclip_img2img.py | 2 +-
 22 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 6a5d8446e994..dc23c4757d71 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -424,7 +424,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index c5b27d9799c5..3b274e5b186c 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -430,7 +430,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
index a910f4dc84db..d059e74676dd 100644
--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -232,7 +232,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index 8129dd76cbf0..7a8897207e8c 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -139,7 +139,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 4098cc52776b..17754ba37e9c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -511,7 +511,6 @@ def run_safety_checker(self, image, device, dtype, output_type):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
index 92b845261c02..f0f3d30d0ebc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -6,10 +6,10 @@
 import torch
 
 from ...schedulers import DDPMScheduler
-from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ...utils import deprecate
-from ..pipeline_utils import DiffusionPipeline
-from . import StableDiffusionUpscalePipeline
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import ImagePipelineOutput
+from . import DiffusionPipeline
 
 
 logger = getLogger(__name__)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
index 2e25d4067cea..54b428f1293c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py
@@ -454,7 +454,6 @@ def run_safety_checker(self, image, device, dtype, output_type):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
index 0d3a9f22a75a..8399aa0d28ca 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py
@@ -469,7 +469,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index 56d46e7fa3af..22c6bc09e68c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -313,7 +313,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index f5e24c23e2e0..cf9d791a92a8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -189,7 +189,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 86035e79c1b9..faa19e199d64 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -439,7 +439,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index f9734bbb0073..ec35a4064fb5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -494,7 +494,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index b554d5ee7ee7..5b0e99002255 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -430,7 +430,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index 9e0f8168b177..e5f538093461 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -622,7 +622,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 6080db189ea7..7ef0caafdee9 100755
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -352,7 +352,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
index 197a003986fb..abb7052563cd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py
@@ -379,7 +379,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
index cbeadac8d20a..037936cf17ba 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -349,7 +349,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
index 9099df1789c1..81260855cbe4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
@@ -583,7 +583,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
index 24dee0b377ef..17e88b2a482e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
@@ -366,7 +366,6 @@ def run_safety_checker(self, image, device, dtype, output_type="pil"):
         if self.safety_checker is None or output_type == "latent":
             has_nsfw_concept = False
         else:
-            image = self.image_processor.postprocess(image, output_type="pt")
             feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 491fdf251974..e1d0d198030b 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -43,6 +43,7 @@
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
+
 torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 8a97404fdde0..82e42b095f5d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -35,6 +35,7 @@
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
 
+
 torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
index 342e624a4de0..450e0af8dcdc 100644
--- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
+++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
@@ -29,7 +29,7 @@
 from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
 from ..test_pipelines_common import (
-    PipelineLatentTesterMixin, 
+    PipelineLatentTesterMixin,
     PipelineTesterMixin,
     assert_mean_pixel_difference,
 )
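
Note: after this patch the safety-checker path converts the decoded tensor to PIL exactly once, for CLIP's feature extractor, instead of first round-tripping through `output_type="pt"`. A standalone sketch of those two lines; the random `decoded` tensor is an illustrative stand-in for `vae.decode(...).sample`:

    import torch
    from transformers import CLIPImageProcessor
    from diffusers.image_processor import VaeImageProcessor

    image_processor = VaeImageProcessor()
    feature_extractor = CLIPImageProcessor()

    decoded = torch.rand(1, 3, 64, 64) * 2 - 1  # stand-in for the decoded VAE sample, in [-1, 1]
    feature_extractor_input = image_processor.postprocess(decoded, output_type="pil")
    safety_checker_input = feature_extractor(feature_extractor_input, return_tensors="pt")
    print(safety_checker_input.pixel_values.shape)  # torch.Size([1, 3, 224, 224])
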
From 809f1fef7e8d16afc681203c696a530ad6f63f28 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 18:19:55 -1000
Subject: [PATCH 16/25] fix tests

---
 src/diffusers/image_processor.py | 5 +++--
 .../pipeline_onnx_stable_diffusion_upscale.py | 3 +--
 tests/others/test_image_processor.py | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index a7073a2ca822..775b532913a6 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -178,8 +178,9 @@ def postprocess(
         if output_type == "latent":
             return image
-
-        image = (image / 2 + 0.5).clamp(0, 1)
+ 
+        if self.config.do_normalize:
+            image = (image / 2 + 0.5).clamp(0, 1)
 
         if isinstance(image, torch.Tensor) and output_type == "pt":
             return image
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
index f0f3d30d0ebc..c55d31e40259 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -8,8 +8,7 @@
 from ...schedulers import DDPMScheduler
 from ...utils import deprecate
 from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
-from ..pipeline_utils import ImagePipelineOutput
-from . import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 logger = getLogger(__name__)
diff --git a/tests/others/test_image_processor.py b/tests/others/test_image_processor.py
index 4f0e2c5aecfd..c2cd6f4a04f4 100644
--- a/tests/others/test_image_processor.py
+++ b/tests/others/test_image_processor.py
@@ -42,7 +42,7 @@ def to_np(self, image):
         return image
 
     def test_vae_image_processor_pt(self):
-        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
 
         input_pt = self.dummy_sample
         input_np = self.to_np(input_pt)
@@ -59,7 +59,7 @@ def test_vae_image_processor_pt(self):
             ), f"decoded output does not match input for output_type {output_type}"
 
     def test_vae_image_processor_np(self):
-        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
         input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1)
 
         for output_type in ["pt", "np", "pil"]:
@@ -72,7 +72,7 @@ def test_vae_image_processor_np(self):
             ), f"decoded output does not match input for output_type {output_type}"
 
     def test_vae_image_processor_pil(self):
-        image_processor = VaeImageProcessor(do_resize=False, do_normalize=False)
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
         input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1)
         input_pil = image_processor.numpy_to_pil(input_np)
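
Note: with `do_normalize` now honored on the output side, `postprocess` only maps [-1, 1] back to [0, 1] when the processor was configured to normalize. A small sketch of the two configurations; the constant tensor `x` is illustrative:

    import torch
    from diffusers.image_processor import VaeImageProcessor

    x = torch.full((1, 3, 8, 8), -1.0)  # pretend decoded output at the bottom of the [-1, 1] range

    passthrough = VaeImageProcessor(do_normalize=False).postprocess(x.clone(), output_type="pt")
    denormalized = VaeImageProcessor(do_normalize=True).postprocess(x.clone(), output_type="pt")
    print(passthrough.min().item(), denormalized.min().item())  # -1.0 0.0
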
From 9ebd8f91cc554d45a34e298facf40142ee7919bb Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 18:20:55 -1000
Subject: [PATCH 17/25] style

---
 src/diffusers/image_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 775b532913a6..4d6c5db149de 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -178,7 +178,7 @@ def postprocess(
         if output_type == "latent":
             return image
- 
+
         if self.config.do_normalize:
             image = (image / 2 + 0.5).clamp(0, 1)
 
From 3dcb7b1439c53c2a1a6385b5e4a0185555772caf Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 18:49:34 -1000
Subject: [PATCH 18/25] fix onnx

---
 .../pipeline_onnx_stable_diffusion_upscale.py | 114 +++++++++++++++++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
index c55d31e40259..01edadd4750d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -1,3 +1,4 @@
+import inspect
 from logging import getLogger
 from typing import Any, Callable, List, Optional, Union
 
@@ -6,7 +7,7 @@
 import torch
 
 from ...schedulers import DDPMScheduler
-from ...utils import deprecate
+from ...utils import deprecate, randn_tensor
 from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
@@ -403,3 +404,114 @@ def _encode_prompt(
             prompt_embeds = np.concatenate([uncond_embeddings, prompt_embeds])
 
         return prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor
+        if isinstance(image, list) or isinstance(image, torch.Tensor):
+            if isinstance(prompt, str):
+                batch_size = 1
+            else:
+                batch_size = len(prompt)
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0]
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        # check noise level
+        if noise_level > self.config.max_noise_level:
+            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
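
Note: `prepare_extra_step_kwargs` above builds the scheduler-step kwargs by introspecting the scheduler's `step()` signature, so `eta` and `generator` are only forwarded to schedulers that accept them. A self-contained illustration of that check:

    import inspect

    from diffusers import DDIMScheduler, DDPMScheduler

    for scheduler in (DDIMScheduler(), DDPMScheduler()):
        params = set(inspect.signature(scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if "eta" in params:  # DDIM exposes eta (η from the DDIM paper); DDPM does not
            extra_step_kwargs["eta"] = 0.0
        print(type(scheduler).__name__, "->", sorted(extra_step_kwargs))
    # DDIMScheduler -> ['eta']
    # DDPMScheduler -> []
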
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device \ No newline at end of file diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index da08f25c7fc5..123f5464dfaa 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -121,11 +121,12 @@ def test_stable_diffusion_img2img_default_case(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) + inputs["image"] = inputs["image"] / 2 + 0.5 image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] @@ -138,11 +139,12 @@ def test_stable_diffusion_img2img_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) + inputs["image"] = inputs["image"] / 2 + 0.5 negative_prompt = "french fries" output = sd_pipe(**inputs, negative_prompt=negative_prompt) image = output.images @@ -157,13 +159,14 @@ def test_stable_diffusion_img2img_multiple_init_images(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) inputs["prompt"] = [inputs["prompt"]] * 2 inputs["image"] = inputs["image"].repeat(2, 1, 1, 1) + inputs["image"] = inputs["image"] / 2 + 0.5 image = sd_pipe(**inputs).images image_slice = image[-1, -3:, -3:, -1] @@ -179,11 +182,12 @@ def test_stable_diffusion_img2img_k_lms(self): beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" ) sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) + inputs["image"] = inputs["image"] / 2 + 0.5 image = sd_pipe(**inputs).images image_slice = image[0, -3:, 
From 04d27fba9952246ca5a13ea365c3a23510a74ba2 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 19:28:56 -1000
Subject: [PATCH 20/25] style

---
 .../stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
index 3a3ebe5b3dfb..fa4c1903cb2a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py
@@ -532,4 +532,4 @@ def _execution_device(self):
                 and module._hf_hook.execution_device is not None
             ):
                 return torch.device(module._hf_hook.execution_device)
-        return self.device
\ No newline at end of file
+        return self.device

From 68bb18b086a5db9b06b3e45ed07dacdacd0aef7f Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 17 Apr 2023 19:41:26 -1000
Subject: [PATCH 21/25] fix alt img2img test

---
 tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
index 144107ec1c97..1f96d8954156 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
@@ -123,6 +123,7 @@ def test_stable_diffusion_img2img_default_case(self):
         tokenizer.model_max_length = 77
 
         init_image = self.dummy_image.to(device)
+        init_image = init_image / 2 + 0.5
 
         # make sure here that pndm scheduler skips prk
         alt_pipe = AltDiffusionImg2ImgPipeline(
@@ -134,7 +135,7 @@ def test_stable_diffusion_img2img_default_case(self):
             safety_checker=None,
             feature_extractor=self.dummy_extractor,
         )
-        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
+        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=True)
         alt_pipe = alt_pipe.to(device)
         alt_pipe.set_progress_bar_config(disable=None)

From 106c43a499aa8e376dc7f512ce3bc84c2c7feb2b Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Mon, 17 Apr 2023 21:27:48 -1000
Subject: [PATCH 22/25] Update src/diffusers/image_processor.py

Co-authored-by: Pedro Cuenca
---
 src/diffusers/image_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 4d6c5db149de..dcc155fdfbf8 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -170,7 +170,7 @@ def postprocess(
     ):
         if output_type not in ["latent", "pt", "np", "pil"]:
             deprecation_message = (
-                f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: "
+                f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
                 "`pil`, `np`, `pt`, `latent`"
             )
             deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
From acf0d605faa2d7749032ef5123fa5b187c6a73a7 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Tue, 18 Apr 2023 11:08:44 -1000
Subject: [PATCH 23/25] Update src/diffusers/image_processor.py

Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

Co-authored-by: Patrick von Platen

update img2img
---
 src/diffusers/image_processor.py | 26 ++++++++++++++-----
 .../pipeline_stable_diffusion.py | 19 +++++++++-----
 .../pipeline_stable_diffusion_img2img.py | 21 ++++++++++-----
 3 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index dcc155fdfbf8..89a160068de6 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import warnings
-from typing import Union
+from typing import Union, Optional, List
 
 import numpy as np
 import PIL
@@ -93,6 +93,13 @@ def normalize(images):
         Normalize an image array to [-1,1]
         """
         return 2.0 * images - 1.0
+
+    @staticmethod
+    def denormalize(images):
+        """
+        Denormalize an image array to [0,1]
+        """
+        return (images / 2 + 0.5).clamp(0, 1)
 
     def resize(self, images: PIL.Image.Image) -> PIL.Image.Image:
         """
@@ -165,9 +172,14 @@ def preprocess(
 
     def postprocess(
         self,
-        image,
+        image: torch.FloatTensor,
         output_type: str = "pil",
-    ):
+        do_normalize: Optional[Union[List[bool], bool]] = None,
+    ):
+        if not isinstance(image, torch.Tensor):
+            raise ValueError(
+                f"Input for postprocess is in incorrect format: {type(image)}. We only support PyTorch tensors."
+            )
         if output_type not in ["latent", "pt", "np", "pil"]:
             deprecation_message = (
                 f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
@@ -179,10 +191,12 @@ def postprocess(
         if output_type == "latent":
             return image
 
-        if self.config.do_normalize:
-            image = (image / 2 + 0.5).clamp(0, 1)
+        if not isinstance(do_normalize, list):
+            do_normalize = image.shape[0] * [do_normalize or self.config.do_normalize]
+
+        image = torch.stack([self.denormalize(image[i]) if do_normalize[i] else image[i] for i in range(image.shape[0])])
 
-        if isinstance(image, torch.Tensor) and output_type == "pt":
+        if output_type == "pt":
             return image
 
         image = self.pt_to_numpy(image)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 10aef6c73a4d..7daf8f5c6a26 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -423,11 +423,14 @@ def _encode_prompt(
 
         return prompt_embeds
 
-    def run_safety_checker(self, image, device, dtype, output_type="pil"):
-        if self.safety_checker is None or output_type == "latent":
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
             has_nsfw_concept = False
         else:
-            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
                 images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
@@ -705,10 +708,12 @@ def __call__(
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
-
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type)
-
-        image = self.image_processor.postprocess(image, output_type=output_type)
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            has_nsfw_concept = False
+
+        do_normalize = [not has_nsfw for has_nsfw in has_nsfw_concept] if isinstance(has_nsfw_concept, list) else not has_nsfw_concept
+        image = self.image_processor.postprocess(image, output_type=output_type, do_normalize=do_normalize)
 
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index faa19e199d64..92d7f5c5af6b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -436,10 +436,13 @@ def _encode_prompt(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype, output_type="pil"):
-        if self.safety_checker is None or output_type == "latent":
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
             has_nsfw_concept = False
         else:
-            feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
             safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
                 images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
@@ -744,10 +747,16 @@ def __call__(
 
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
-
-        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype, output_type=output_type)
-
-        image = self.image_processor.postprocess(image, output_type=output_type)
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            has_nsfw_concept = False
+
+        do_normalize = (
+            [not has_nsfw for has_nsfw in has_nsfw_concept]
+            if isinstance(has_nsfw_concept, list)
+            else not has_nsfw_concept
+        )
+        image = self.image_processor.postprocess(image, output_type=output_type, do_normalize=do_normalize)
 
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
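
Note: the per-image `do_normalize` list exists for the NSFW case: the safety checker zeroes out flagged images, and denormalizing an all-zero image with `x / 2 + 0.5` would turn black into grey. A standalone sketch of the logic `postprocess` now applies, with a local re-implementation of `denormalize` and made-up flags:

    import torch

    def denormalize(images: torch.Tensor) -> torch.Tensor:
        # [-1, 1] -> [0, 1], mirroring VaeImageProcessor.denormalize
        return (images / 2 + 0.5).clamp(0, 1)

    image = torch.stack([torch.rand(3, 8, 8) * 2 - 1,  # ordinary sample in [-1, 1]
                         torch.zeros(3, 8, 8)])        # NSFW-flagged sample, already blacked out
    has_nsfw_concept = [False, True]

    do_normalize = [not flagged for flagged in has_nsfw_concept]
    image = torch.stack(
        [denormalize(image[i]) if do_normalize[i] else image[i] for i in range(image.shape[0])]
    )
    print(image[1].abs().max().item())  # the flagged image stays black: 0.0
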
From f03ff17cfeb1cc254ea59946c72757876b5f1920 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 24 Apr 2023 20:29:20 -1000
Subject: [PATCH 24/25] valueerror not needed

---
 src/diffusers/image_processor.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 89a160068de6..a8e293c998f5 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -203,7 +203,6 @@ def postprocess(
 
         if output_type == "np":
             return image
-        elif output_type == "pil":
+
+        if output_type == "pil":
             return self.numpy_to_pil(image)
-        else:
-            raise ValueError(f"Unsupported output_type {output_type}.")

From 8c4a31bac50cccd04c10f478b3873716b8c7fa31 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Mon, 24 Apr 2023 20:29:50 -1000
Subject: [PATCH 25/25] fix output_type=latents

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 1 +
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 7734de134c17..f8d758fc248a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -718,6 +718,7 @@ def __call__(
             image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
         else:
+            image = latents
            has_nsfw_concept = False
 
         do_normalize = [not has_nsfw for has_nsfw in has_nsfw_concept] if isinstance(has_nsfw_concept, list) else not has_nsfw_concept
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 507aea5ab178..c9df2fffbae6 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -757,6 +757,7 @@ def __call__(
             image = self.vae.decode(latents / self.vae.config.scaling_factor).sample
             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
         else:
+            image = latents
            has_nsfw_concept = False
 
        do_normalize = (