a few fixes for the Kandinsky combined pipeline (huggingface#4352)
* add xformers support

* enable_sequential_cpu_offload

* style

* Update src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py

Co-authored-by: Steven Liu <[email protected]>

---------

Co-authored-by: yiyixuxu <yixu310@gmail.com>
Co-authored-by: Steven Liu <[email protected]>
3 people authored Aug 4, 2023
1 parent 1a8843f commit 29ece0d
Showing 4 changed files with 85 additions and 2 deletions.
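
For users, the practical effect of these changes is that the Kandinsky combined pipelines now forward `enable_xformers_memory_efficient_attention` and `enable_sequential_cpu_offload` to their underlying prior/decoder pipelines. A minimal usage sketch follows; the `AutoPipelineForText2Image` loading path and the checkpoint id are illustrative assumptions, not part of this diff.

import torch
from diffusers import AutoPipelineForText2Image

# Assumed checkpoint id; AutoPipelineForText2Image resolves it to the
# combined pipeline patched in this commit.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# Forwarded to the decoder pipe (per the diff below).
pipe.enable_xformers_memory_efficient_attention()

# Forwarded to both the prior and decoder pipes, submodule by submodule.
pipe.enable_sequential_cpu_offload()

image = pipe(prompt="a portrait of a cat, 4k photo").images[0]
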
41 changes: 41 additions & 0 deletions src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
@@ -188,6 +188,9 @@ def __init__(
movq=movq,
)

def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -198,6 +201,16 @@ def enable_model_cpu_offload(self, gpu_id=0):
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()

def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗 Accelerate, significantly reducing memory usage. Models are moved to a
`torch.device('meta')` and loaded on a GPU only when their specific submodule's `forward` method is called.
Offloading happens on a submodule basis. Memory savings are higher than using
`enable_model_cpu_offload`, but performance is lower.
"""
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)

def progress_bar(self, iterable=None, total=None):
self.prior_pipe.progress_bar(iterable=iterable, total=total)
self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -398,6 +411,9 @@ def __init__(
movq=movq,
)

def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -408,6 +424,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()

def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)

def progress_bar(self, iterable=None, total=None):
self.prior_pipe.progress_bar(iterable=iterable, total=total)
self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -630,6 +657,9 @@ def __init__(
movq=movq,
)

def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -640,6 +670,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()

def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)

def progress_bar(self, iterable=None, total=None):
self.prior_pipe.progress_bar(iterable=iterable, total=total)
self.decoder_pipe.progress_bar(iterable=iterable, total=total)
42 changes: 42 additions & 0 deletions src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
@@ -177,6 +177,9 @@ def __init__(
movq=movq,
)

def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -187,6 +190,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()

def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)

def progress_bar(self, iterable=None, total=None):
self.prior_pipe.progress_bar(iterable=iterable, total=total)
self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -378,6 +392,9 @@ def __init__(
movq=movq,
)

def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -388,6 +405,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()

def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)

def progress_bar(self, iterable=None, total=None):
self.prior_pipe.progress_bar(iterable=iterable, total=total)
self.decoder_pipe.progress_bar(iterable=iterable, total=total)
@@ -601,6 +629,9 @@ def __init__(
movq=movq,
)

def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -611,6 +642,17 @@ def enable_model_cpu_offload(self, gpu_id=0):
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()

def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)

def progress_bar(self, iterable=None, total=None):
self.prior_pipe.progress_bar(iterable=iterable, total=total)
self.decoder_pipe.progress_bar(iterable=iterable, total=total)
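
Taken together, the two offload methods trade speed for memory: `enable_model_cpu_offload` moves whole models with a low performance impact, while the new `enable_sequential_cpu_offload` delegation moves individual submodules for larger savings at lower speed. A hedged sketch of how a caller might choose between them; the VRAM threshold and checkpoint id are illustrative assumptions.

import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# Illustrative heuristic, not a recommendation from the diff: prefer the
# faster whole-model offload when there is VRAM headroom, and fall back to
# submodule-level offload on small GPUs.
total_vram = torch.cuda.get_device_properties(0).total_memory
if total_vram >= 10 * 1024**3:
    pipe.enable_model_cpu_offload()       # lower memory savings, faster
else:
    pipe.enable_sequential_cpu_offload()  # higher memory savings, slower
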
2 changes: 1 addition & 1 deletion tests/pipelines/kandinsky/test_kandinsky_combined.py
@@ -51,7 +51,7 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
"output_type",
"return_dict",
]
-    test_xformers_attention = False
+    test_xformers_attention = True

def get_dummy_components(self):
dummy = Dummies()
2 changes: 1 addition & 1 deletion tests/pipelines/kandinsky_v22/test_kandinsky_combined.py
@@ -55,7 +55,7 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
"output_type",
"return_dict",
]
-    test_xformers_attention = False
+    test_xformers_attention = True

def get_dummy_components(self):
dummy = Dummies()
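
With `test_xformers_attention` flipped to `True` in both classes, the xformers attention test inherited from `PipelineTesterMixin` now runs for the combined pipelines. A sketch of driving one of the updated test classes directly; it assumes a development checkout of diffusers (run from the repository root) with xformers installed, and the direct-import approach is an assumption rather than the project's documented test workflow.

import unittest

# Hypothetical direct run; the class name comes from the diff above.
from tests.pipelines.kandinsky.test_kandinsky_combined import (
    KandinskyPipelineCombinedFastTests,
)

suite = unittest.TestLoader().loadTestsFromTestCase(KandinskyPipelineCombinedFastTests)
unittest.TextTestRunner(verbosity=2).run(suite)
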
