PC環境
Windows 11 CUDA 12.4 Python 3.12
Python環境構築
pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
pip install git+https://github.com/huggingface/diffusers
pip install transformers accelerate sentencepiece imageio imageio-ffmpeg beautifulsoup4 ftfy
Pythonスクリプト
import gc
import torch
from diffusers import AutoencoderKLAllegro, AllegroPipeline
from diffusers.utils import export_to_video
from decorator import gpu_monitor, time_monitor  # project-local monitoring decorators


def flush():
    """Release Python garbage and return cached CUDA blocks to the allocator.

    Called between pipeline stages so the next large model load starts
    from a minimal VRAM footprint.
    """
    gc.collect()
    torch.cuda.empty_cache()


@gpu_monitor(interval=0.5)
@time_monitor
def main():
    """Generate a video with Allegro in two VRAM-saving stages.

    Stage 1 loads only the text encoder/tokenizer (transformer and VAE
    disabled) to compute prompt embeddings; Stage 2 reloads the pipeline
    with the transformer and VAE but without the text encoder, reusing
    the precomputed embeddings. Writes the result to ``output.mp4``.
    """
    model_id = "rhymes-ai/Allegro"
    # Stage 1: text encoding only — skip transformer and VAE to cap peak VRAM.
    pipe = AllegroPipeline.from_pretrained(
        model_id,
        transformer=None,
        vae=None,
        torch_dtype=torch.bfloat16,
    )
    pipe.to("cuda")
    prompt = "A seaside harbor with bright sunlight and sparkling seawater, with many boats in the water. From an aerial view, the boats vary in size and color, some moving and some stationary. Fishing boats in the water suggest that this location might be a popular spot for docking fishing boats."
    # Quality-boosting template; "{}" is filled with the scene prompt below.
    positive_prompt = (
        "(masterpiece), (best quality), (ultra-detailed), (unwatermarked), "
        "{}, "
        "emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, "
        "sharp focus, high budget, cinemascope, moody, epic, gorgeous"
    )
    negative_prompt = """ nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry. 
"""
    prompt = positive_prompt.format(prompt.lower().strip())
    # Inference only — no gradients needed for prompt encoding.
    with torch.no_grad():
        prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask = pipe.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            max_sequence_length=512
        )
    print(f"torch.cuda.max_memory_allocated: {torch.cuda.max_memory_allocated()/ 1024**3:.2f} GB")
    # Free the text-encoder pipeline before loading the heavy denoising stack.
    del pipe
    flush()
    # Stage 2: denoising + decoding. VAE is kept in float32 (rest of the
    # pipeline runs in bfloat16).
    vae = AutoencoderKLAllegro.from_pretrained(
        model_id,
        subfolder="vae",
        torch_dtype=torch.float32
    )
    pipe = AllegroPipeline.from_pretrained(
        model_id,
        vae=vae,
        text_encoder=None,
        tokenizer=None,
        torch_dtype=torch.bfloat16
    )
    pipe.to("cuda")
    # Tiled VAE decoding trades speed for a lower VRAM peak.
    pipe.vae.enable_tiling()
    video = pipe(
        prompt=None,
        negative_prompt=None,
        # Reuse Stage-1 embeddings instead of re-encoding text.
        prompt_embeds=prompt_embeds,
        prompt_attention_mask=prompt_attention_mask,
        negative_prompt_embeds=negative_prompt_embeds,
        negative_prompt_attention_mask=negative_prompt_attention_mask,
        guidance_scale=7.5,
        num_inference_steps=100,
        generator=torch.manual_seed(42)
    ).frames[0]
    print(f"torch.cuda.max_memory_allocated: {torch.cuda.max_memory_allocated()/ 1024**3:.2f} GB")
    del vae
    del pipe
    flush()
    export_to_video(video, "output.mp4", fps=15)


if __name__ == "__main__":
    main()
結果
torch.cuda.max_memory_allocated: 9.09 GB
torch.cuda.max_memory_allocated: 16.19 GB
time: 6150.23 sec
GPU 0 - Used memory: 23.82/23.99 GB
作成した動画は以下のGoogle Bloggerに載せています。
support-touchsp.blogspot.com
その他
ベンチマークはこちらで記述したスクリプトで行いました。touch-sp.hatenablog.com