PC環境
Windows 11 CUDA 12.4 Python 3.12
Python環境構築
pip install torch==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
pip install diffusers transformers
pip install ftfy imageio-ffmpeg imageio
Pythonスクリプト
"""Generate a short video from a text prompt with Wan2.1 (diffusers)."""
import torch
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video

# Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
MODEL_ID = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"

PROMPT = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
NEGATIVE_PROMPT = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

# The VAE is loaded separately in float32; the rest of the pipeline uses bfloat16.
wan_vae = AutoencoderKLWan.from_pretrained(
    MODEL_ID,
    subfolder="vae",
    torch_dtype=torch.float32,
)
wan_pipe = WanPipeline.from_pretrained(
    MODEL_ID,
    vae=wan_vae,
    torch_dtype=torch.bfloat16,
)
wan_pipe.enable_model_cpu_offload()

# A fixed seed keeps the run reproducible.
seeded_generator = torch.Generator(device="cuda").manual_seed(0)

result = wan_pipe(
    prompt=PROMPT,
    negative_prompt=NEGATIVE_PROMPT,
    height=480,
    width=832,
    num_frames=81,
    generator=seeded_generator,
)
# Only the first (and only) generated clip is exported.
video_frames = result.frames[0]
export_to_video(video_frames, "output.mp4", fps=16)
プロンプトは以下です。
Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage.
DeepLで翻訳
着心地の良いボクシングウェアと鮮やかなグローブを身につけた2匹の擬人化された猫が、スポットライトを浴びたステージで激しく戦う。
結果

動画はGoogle Bloggerに載せています。
support-touchsp.blogspot.com
VRAM使用量とかかった時間
RTX 4090で測定しました。「enable_vae_slicing()」と「enable_vae_tiling()」は今のところ使えないようです。
時間によるソート(速い順)
to("cuda")
enable_model_cpu_offload()
time: 337.86 sec
memory: 13.96 GB
enable_model_cpu_offload()
time: 340.7 sec
memory: 11.78 GB
to("cuda")
enable_sequential_cpu_offload()
time: 389.66 sec
memory: 13.96 GB
to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
time: 392.42 sec
memory: 13.96 GB
enable_sequential_cpu_offload()
time: 397.36 sec
memory: 10.27 GB
enable_model_cpu_offload()
enable_sequential_cpu_offload()
time: 399.05 sec
memory: 10.27 GB
to("cuda")
time: 441.75 sec
memory: 25.43 GB
VRAM使用量によるソート(少ない順)
enable_model_cpu_offload()
enable_sequential_cpu_offload()
memory: 10.27 GB
time: 399.05 sec
enable_sequential_cpu_offload()
memory: 10.27 GB
time: 397.36 sec
enable_model_cpu_offload()
memory: 11.78 GB
time: 340.7 sec
to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
memory: 13.96 GB
time: 392.42 sec
to("cuda")
enable_model_cpu_offload()
memory: 13.96 GB
time: 337.86 sec
to("cuda")
enable_sequential_cpu_offload()
memory: 13.96 GB
time: 389.66 sec
to("cuda")
memory: 25.43 GB
time: 441.75 sec
ベンチマークのためのPythonスクリプト
"""Benchmark Wan2.1 text-to-video generation under combinations of diffusers options.

Every on/off combination of five options is tried; for each run that
succeeds, the peak reserved CUDA memory and the elapsed wall-clock time are
recorded and finally printed sorted by time and by memory.
"""
import gc
import time
from itertools import product
from typing import Optional, Tuple, TypedDict

import torch
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video


def reset_memory():
    """Collect garbage, empty the CUDA cache and reset CUDA memory statistics."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_accumulated_memory_stats()
    torch.cuda.reset_peak_memory_stats()


class ResultDict(TypedDict):
    """Measurements collected for one successful benchmark run."""

    # torch.cuda.max_memory_reserved() divided by 1024**3, rounded to 2 places.
    memory: float
    # Seconds spent on generation plus video export, rounded to 2 places.
    time_required: float
    # Newline-joined labels of the options that were applied.
    combination: str


def main(
    i: int,
    combination: Tuple[bool, bool, bool, bool, bool]
) -> Optional[ResultDict]:
    """Run one benchmark configuration and return its measurements.

    Args:
        i: Index of the run; the video is written to ``output{i}.mp4``.
        combination: Five flags, in order: ``pipe.to("cuda")``,
            ``enable_model_cpu_offload()``, ``enable_sequential_cpu_offload()``,
            ``vae.enable_slicing()``, ``vae.enable_tiling()``.

    Returns:
        The measurements of the run, or ``None`` when the combination is
        skipped (none of the first three flags is set) or when applying the
        options, generating or exporting raised an exception.
    """
    # At least one of the first three options is required; otherwise the run
    # is skipped before the model is even loaded.
    if not any(combination[:3]):
        return None

    # Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
    vae = AutoencoderKLWan.from_pretrained(
        model_id,
        subfolder="vae",
        torch_dtype=torch.float32
    )
    pipe = WanPipeline.from_pretrained(
        model_id,
        vae=vae,
        torch_dtype=torch.bfloat16
    )

    prompt = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
    negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

    # Labels are appended only after the corresponding call succeeded, so on
    # failure the printed list shows the options applied up to that point.
    combination_list = []
    try:
        if combination[0]:
            pipe.to("cuda")
            combination_list.append("to(\"cuda\")")
        if combination[1]:
            pipe.enable_model_cpu_offload()
            combination_list.append("enable_model_cpu_offload()")
        if combination[2]:
            pipe.enable_sequential_cpu_offload()
            combination_list.append("enable_sequential_cpu_offload()")
        if combination[3]:
            pipe.vae.enable_slicing()
            combination_list.append("vae.enable_slicing()")
        if combination[4]:
            pipe.vae.enable_tiling()
            combination_list.append("vae.enable_tiling()")

        # The timed section covers generation and the video export, but not
        # model loading or applying the options above.
        start_time = time.time()
        generator = torch.Generator(device="cuda").manual_seed(0)
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=480,
            width=832,
            num_frames=81,
            generator=generator
        ).frames[0]
        export_to_video(output, f"output{i}.mp4", fps=16)
        end_time = time.time()

        result: ResultDict = {
            "memory": round(torch.cuda.max_memory_reserved() / 1024**3, 2),
            "time_required": round(end_time - start_time, 2),
            "combination": "\n".join(combination_list)
        }
    except Exception as e:
        # Best effort: a failing combination is reported and skipped so the
        # remaining combinations still run.
        print("\n".join(combination_list))
        print(e)
        return None

    print("succeee!!")
    print("\n".join(combination_list))
    print(f"saved video as output{i}.mp4")
    return result


if __name__ == "__main__":
    result_list = []
    for i, combination in enumerate(product([True, False], repeat=5)):
        # Start every run with a clean cache and fresh peak-memory statistics.
        reset_memory()
        result = main(i, combination)
        if result is not None:
            result_list.append(result)

    print("Sorted by time taken")
    time_sorted_list = sorted(result_list, key=lambda x: x["time_required"])
    for time_sorted in time_sorted_list:
        print(time_sorted["combination"])
        print(f"time: {time_sorted['time_required']} sec")
        print(f"memory: {time_sorted['memory']} GB")
        print()

    print("Sorted by memory used")
    memory_sorted_list = sorted(result_list, key=lambda x: x["memory"])
    for memory_sorted in memory_sorted_list:
        print(memory_sorted["combination"])
        print(f"memory: {memory_sorted['memory']} GB")
        print(f"time: {memory_sorted['time_required']} sec")
        print()