[wip]
zhengzangw committed May 7, 2024
1 parent c00a06a commit fa314ed
Showing 46 changed files with 90 additions and 88 deletions.
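Note: nearly every file below makes the same one-line change, renaming the model config key enable_flashattn to enable_flash_attn. A minimal sketch of a migration shim for configs written against the old spelling — the helper name and behavior are illustrative assumptions, not part of this commit:

def migrate_model_cfg(cfg: dict) -> dict:
    # Hypothetical helper (not in this commit): rewrite the legacy key so
    # configs written for pre-rename versions keep working.
    if "enable_flashattn" in cfg:
        cfg["enable_flash_attn"] = cfg.pop("enable_flashattn")
    return cfg

cfg = migrate_model_cfg(dict(type="STDiT-XL/2", enable_flashattn=True))
assert cfg == {"type": "STDiT-XL/2", "enable_flash_attn": True}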
2 changes: 1 addition & 1 deletion README.md
@@ -161,7 +161,7 @@ conda activate opensora
 pip install torch torchvision
 
 # install flash attention (optional)
-# set enable_flashattn=False in config to avoid using flash attention
+# set enable_flash_attn=False in config to avoid using flash attention
 pip install packaging ninja
 pip install flash-attn --no-build-isolation
 
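Note: the README comment above suggests setting enable_flash_attn=False when flash attention is not installed. A hedged sketch of doing that automatically — the import probe is an assumption; only the config key comes from this diff:

try:
    import flash_attn  # noqa: F401  # probe for the optional dependency
    _flash_ok = True
except ImportError:
    _flash_ok = False

model = dict(
    type="STDiT-XL/2",
    enable_flash_attn=_flash_ok,  # False avoids flash attention entirely
)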
2 changes: 1 addition & 1 deletion configs/dit/train/16x256x256.py
@@ -18,7 +18,7 @@
 model = dict(
     type="DiT-XL/2",
     from_pretrained="DiT-XL-2-256x256.pt",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/dit/train/1x256x256.py
@@ -19,7 +19,7 @@
 model = dict(
     type="DiT-XL/2",
     no_temporal_pos_emb=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/latte/train/16x256x256.py
@@ -17,7 +17,7 @@
 # Define model
 model = dict(
     type="Latte-XL/2",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-1/train/benchmark.py
@@ -66,7 +66,7 @@
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
     qk_norm_legacy=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-1/train/image.py
@@ -30,7 +30,7 @@
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
     qk_norm_legacy=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
6 changes: 3 additions & 3 deletions configs/opensora-v1-1/train/image_rflow.py
@@ -35,7 +35,7 @@
 #     type="DiT-XL/2",
 #     from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
 #     # input_sq_size=512,  # pretrained model is trained on 512x512
-#     enable_flashattn=True,
+#     enable_flash_attn=True,
 #     enable_layernorm_kernel=True,
 # )
 model = dict(
@@ -44,7 +44,7 @@
     time_scale=1.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 # model = dict(
@@ -54,7 +54,7 @@
 #     no_temporal_pos_emb=True,
 #     # from_pretrained="PixArt-XL-2-512x512.pth",
 #     from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
-#     enable_flashattn=True,
+#     enable_flash_attn=True,
 #     enable_layernorm_kernel=True,
 # )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-1/train/stage1.py
@@ -42,7 +42,7 @@
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
     qk_norm_legacy=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-1/train/stage2.py
@@ -44,7 +44,7 @@
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
     qk_norm_legacy=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-1/train/stage3.py
@@ -44,7 +44,7 @@
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
     qk_norm_legacy=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-1/train/video.py
@@ -32,7 +32,7 @@
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
     qk_norm_legacy=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-2/inference/sample-ref.py
@@ -22,7 +22,7 @@
     type="STDiT3-XL/2",
     from_pretrained=None,
     qk_norm=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-2/inference/sample.py
@@ -14,7 +14,7 @@
     type="STDiT3-XL/2",
     from_pretrained=None,
     qk_norm=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora-v1-2/train/stage1-gc.py
@@ -36,7 +36,7 @@
     type="STDiT3-XL/2",
     from_pretrained=None,
     qk_norm=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
4 changes: 2 additions & 2 deletions configs/opensora-v1-2/train/stage1.py
@@ -34,12 +34,12 @@
     type="STDiT3-XL/2",
     from_pretrained=None,
     qk_norm=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
     type="VideoAutoencoderPipeline",
-    from_pretrained="pretrained_models/vae-v3",
+    from_pretrained="pretrained_models/vae-v2",
     micro_frame_size=17,
     vae_2d=dict(
         type="VideoAutoencoderKL",
2 changes: 1 addition & 1 deletion configs/opensora/inference/16x256x256.py
@@ -7,7 +7,7 @@
     type="STDiT-XL/2",
     space_scale=0.5,
     time_scale=1.0,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
2 changes: 1 addition & 1 deletion configs/opensora/inference/16x512x512-rflow.py
@@ -7,7 +7,7 @@
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
2 changes: 1 addition & 1 deletion configs/opensora/inference/16x512x512.py
@@ -7,7 +7,7 @@
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
2 changes: 1 addition & 1 deletion configs/opensora/inference/64x512x512.py
@@ -7,7 +7,7 @@
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
2 changes: 1 addition & 1 deletion configs/opensora/train/16x256x256-mask.py
@@ -20,7 +20,7 @@
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {
4 changes: 2 additions & 2 deletions configs/opensora/train/16x256x256-spee-rflow.py
@@ -22,8 +22,8 @@
     # from_pretrained="PixArt-XL-2-512x512.pth",
     # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth",
     # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth",
-    from_pretrained = "PRETRAINED_MODEL",
-    enable_flashattn=True,
+    from_pretrained="PRETRAINED_MODEL",
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
2 changes: 1 addition & 1 deletion configs/opensora/train/16x256x256-spee.py
@@ -20,7 +20,7 @@
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {
2 changes: 1 addition & 1 deletion configs/opensora/train/16x256x256.py
@@ -20,7 +20,7 @@
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora/train/16x512x512.py
@@ -20,7 +20,7 @@
     space_scale=1.0,
     time_scale=1.0,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/opensora/train/360x512x512.py
@@ -26,7 +26,7 @@
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )
2 changes: 1 addition & 1 deletion configs/opensora/train/64x512x512-sp.py
@@ -20,7 +20,7 @@
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )
2 changes: 1 addition & 1 deletion configs/opensora/train/64x512x512.py
@@ -20,7 +20,7 @@
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/pixart/inference/1x20481B.py
@@ -8,7 +8,7 @@
     from_pretrained="PixArt-1B-2.pth",
     space_scale=4,
     no_temporal_pos_emb=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     base_size=2048 // 8,
 )
2 changes: 1 addition & 1 deletion configs/pixart/inference/1x2048MS.py
@@ -8,7 +8,7 @@
     from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth",
     space_scale=4,
     no_temporal_pos_emb=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     base_size=2048 // 8,
 )
2 changes: 1 addition & 1 deletion configs/pixart/train/16x256x256.py
@@ -20,7 +20,7 @@
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/pixart/train/1x2048x2048.py
@@ -20,7 +20,7 @@
     space_scale=4.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-1B-2.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 
4 changes: 2 additions & 2 deletions configs/pixart/train/1x512x512-rflow.py
@@ -21,8 +21,8 @@
     time_scale=1.0,
     no_temporal_pos_emb=True,
     # from_pretrained="PixArt-XL-2-512x512.pth",
-    from_pretrained = "PRETRAINED_MODEL",
-    enable_flashattn=True,
+    from_pretrained="PRETRAINED_MODEL",
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/pixart/train/1x512x512.py
@@ -21,7 +21,7 @@
     time_scale=1.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
2 changes: 1 addition & 1 deletion configs/pixart/train/64x512x512.py
@@ -21,7 +21,7 @@
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
6 changes: 3 additions & 3 deletions docs/config.md
@@ -26,8 +26,8 @@ model = dict(
     from_pretrained="PRETRAINED_MODEL",  # (Optional) Load from pretrained model
     input_sq_size=512,  # Base spatial position embedding size
     qk_norm=True,  # Normalize query and key in attention
-    enable_flashattn=True,  # (Optional) Speed up training and inference with flash attention
-    # Turn enable_flashattn to False if you skip flashattn installation
+    enable_flash_attn=True,  # (Optional) Speed up training and inference with flash attention
+    # Turn enable_flash_attn to False if you skip flashattn installation
     enable_layernorm_kernel=True,  # (Optional) Speed up training and inference with fused kernel
     # Turn enable_layernorm_kernel to False if you skip apex installation
 )
@@ -177,7 +177,7 @@ model = dict(
     from_pretrained=None,  # Load from pretrained model
     input_sq_size=512,  # Base spatial position embedding size
     qk_norm=True,  # Normalize query and key in attention
-    enable_flashattn=True,  # (Optional) Speed up training and inference with flash attention
+    enable_flash_attn=True,  # (Optional) Speed up training and inference with flash attention
     enable_layernorm_kernel=True,  # (Optional) Speed up training and inference with fused kernel
 )
 vae = dict(
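Note: docs/config.md above states that both optional kernels can be turned off when their packages were not installed. A sketch that sets each flag from an availability probe — the importlib probing is an assumption layered on the documented keys:

import importlib.util

def _has(mod: str) -> bool:
    # True when the optional package is importable in this environment.
    return importlib.util.find_spec(mod) is not None

model = dict(
    type="STDiT3-XL/2",
    from_pretrained=None,
    qk_norm=True,
    enable_flash_attn=_has("flash_attn"),  # False if flash-attn install was skipped
    enable_layernorm_kernel=_has("apex"),  # False if apex install was skipped
)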
4 changes: 2 additions & 2 deletions docs/zh_CN/structure.md
@@ -89,7 +89,7 @@ model = dict(
     type="STDiT-XL/2",  # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
     space_scale=1.0,  # (Optional) Space positional encoding scale (new height / old height)
     time_scale=2 / 3,  # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
-    enable_flashattn=True,  # (Optional) Speed up training and inference with flash attention
+    enable_flash_attn=True,  # (Optional) Speed up training and inference with flash attention
     enable_layernorm_kernel=True,  # (Optional) Speed up training and inference with fused kernel
     from_pretrained="PRETRAINED_MODEL",  # (Optional) Load from pretrained model
     no_temporal_pos_emb=True,  # (Optional) Disable temporal positional encoding (for image)
@@ -144,7 +144,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained="YOUR_PRETRAINED_MODEL",
-    enable_flashattn=True,  # Enable flash attention
+    enable_flash_attn=True,  # Enable flash attention
     enable_layernorm_kernel=True,  # Enable layernorm kernel
 )
 vae = dict(
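Note: the structure doc above defines space_scale as new height / old height and time_scale as new frame_interval / old frame_interval. A worked sketch with illustrative numbers (the 512→512 height and 3→2 frame-interval values are assumptions) reproducing the 1.0 and 2/3 in the example config:

# Illustrative numbers only; the scale formulas come from the doc above.
old_height, new_height = 512, 512
old_interval, new_interval = 3, 2

model = dict(
    type="STDiT-XL/2",
    space_scale=new_height / old_height,       # 512 / 512 = 1.0
    time_scale=new_interval / old_interval,    # 2 / 3
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
)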