diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
index c7f439fbabb6..804b5ded347e 100644
--- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
+++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
@@ -188,6 +188,9 @@ def __init__(
             movq=movq,
         )
 
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -198,6 +201,16 @@ def enable_model_cpu_offload(self, gpu_id=0):
         self.prior_pipe.enable_model_cpu_offload()
         self.decoder_pipe.enable_model_cpu_offload()
 
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
+        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
+        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
+        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
         self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -398,6 +411,9 @@ def __init__(
             movq=movq,
         )
 
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -408,6 +424,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
         self.prior_pipe.enable_model_cpu_offload()
         self.decoder_pipe.enable_model_cpu_offload()
 
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
         self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -630,6 +657,9 @@ def __init__(
             movq=movq,
         )
 
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -640,6 +670,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
         self.prior_pipe.enable_model_cpu_offload()
         self.decoder_pipe.enable_model_cpu_offload()
 
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
         self.decoder_pipe.progress_bar(iterable=iterable, total=total)
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
index 977a82fdbc9f..6c174d46a55f 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
@@ -177,6 +177,9 @@ def __init__(
             movq=movq,
         )
 
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -187,6 +190,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
         self.prior_pipe.enable_model_cpu_offload()
         self.decoder_pipe.enable_model_cpu_offload()
 
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
         self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -378,6 +392,9 @@ def __init__(
             movq=movq,
         )
 
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -388,6 +405,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
         self.prior_pipe.enable_model_cpu_offload()
         self.decoder_pipe.enable_model_cpu_offload()
 
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
         self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -601,6 +629,9 @@ def __init__(
             movq=movq,
         )
 
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
+
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -611,6 +642,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
         self.prior_pipe.enable_model_cpu_offload()
         self.decoder_pipe.enable_model_cpu_offload()
 
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
         self.decoder_pipe.progress_bar(iterable=iterable, total=total)
diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py
index 21c8e78cfade..7629407ab745 100644
--- a/tests/pipelines/kandinsky/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py
@@ -51,7 +51,7 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
         "output_type",
         "return_dict",
     ]
-    test_xformers_attention = False
+    test_xformers_attention = True
 
     def get_dummy_components(self):
         dummy = Dummies()
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py b/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
index 666ea30bd0fd..7591b2347a92 100644
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
@@ -55,7 +55,7 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
         "output_type",
         "return_dict",
    ]
-    test_xformers_attention = False
+    test_xformers_attention = True
 
     def get_dummy_components(self):
         dummy = Dummies()
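
Usage sketch (not part of the patch): with these additions the combined pipelines forward both calls to the sub-pipelines they wrap, so the memory-saving options can be toggled on the top-level object. A minimal sketch, assuming xformers is installed, a CUDA device is available, and a diffusers version that ships AutoPipelineForText2Image; the checkpoint id and prompt are illustrative and not taken from the PR:

import torch

from diffusers import AutoPipelineForText2Image

# Load a Kandinsky 2.2 combined pipeline (prior + decoder wrapped in one object).
# The checkpoint id below is illustrative.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# Forwarded to both prior_pipe and decoder_pipe: submodules stay on the CPU and
# are only moved to the GPU while their forward() runs.
pipe.enable_sequential_cpu_offload()

# Forwarded to decoder_pipe: enables xformers memory-efficient attention.
pipe.enable_xformers_memory_efficient_attention()

image = pipe("a portrait of a red fox, 4k photo", num_inference_steps=25).images[0]
image.save("fox.png")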