はじめに
「LTX-Video 0.9.7 Distilled」を使って動画作成(Text2Video)を行ってみました。Version 0.9.1の記事はこちらです。
touch-sp.hatenablog.com
Version 0.9.5の記事はこちらです。
touch-sp.hatenablog.com
PC環境
Windows 11 RTX 3080 Laptop (VRAM 16GB) CUDA 12.6 Python 3.12
Python環境構築
pip install torch==2.7.0+cu126 --index-url https://download.pytorch.org/whl/cu126 pip install git+https://github.com/huggingface/diffusers pip install accelerate transformers sentencepiece imageio imageio-ffmpeg
accelerate==1.7.0 diffusers @ git+https://github.com/huggingface/diffusers@a5f4cc7f846d3edbd91264675c4e441608cbbb93 imageio==2.37.0 imageio-ffmpeg==0.6.0 sentencepiece==0.2.0 torch==2.7.0+cu126 transformers==4.52.3
結果
VRAM消費量と動画生成時間を測定するためのスクリプトを書いて実行しました。「enable_model_cpu_offload()」と「vae.enable_tiling()」は必須としました。「enable_sequential_cpu_offload()」を追加するとVRAM消費量が激減しました。「vae.enable_slicing()」はほとんど効果ありませんでした。
作成時間でソート
enable_model_cpu_offload() enable_sequential_cpu_offload() vae.enable_slicing() vae.enable_tiling() time: 257.19 sec memory: 3.8 GB enable_model_cpu_offload() enable_sequential_cpu_offload() vae.enable_tiling() time: 261.28 sec memory: 3.8 GB enable_model_cpu_offload() vae.enable_tiling() time: 2485.06 sec memory: 25.58 GB enable_model_cpu_offload() vae.enable_slicing() vae.enable_tiling() time: 2579.46 sec memory: 25.58 GB
VRAM使用量でソート
enable_model_cpu_offload() enable_sequential_cpu_offload() vae.enable_slicing() vae.enable_tiling() memory: 3.8 GB time: 257.19 sec enable_model_cpu_offload() enable_sequential_cpu_offload() vae.enable_tiling() memory: 3.8 GB time: 261.28 sec enable_model_cpu_offload() vae.enable_slicing() vae.enable_tiling() memory: 25.58 GB time: 2579.46 sec enable_model_cpu_offload() vae.enable_tiling() memory: 25.58 GB time: 2485.06 sec
作成動画
作成した動画は以下のGoogle Bloggerに載せています。support-touchsp.blogspot.com
Pythonスクリプト
import gc
import time
from itertools import product
from typing import Tuple, TypedDict

import torch
from diffusers import LTXConditionPipeline
from diffusers.utils import export_to_video


def reset_memory() -> None:
    """Free cached CUDA memory and reset peak/accumulated memory statistics.

    Called before each run so that max_memory_reserved() reflects only the
    upcoming generation, not earlier combinations.
    """
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_accumulated_memory_stats()
    torch.cuda.reset_peak_memory_stats()


class ResultDict(TypedDict):
    """Measurement result for one combination of memory-saving options."""
    # Fixed: original field was misspelled "memeory", which did not match the
    # "memory" key actually stored below.
    memory: float          # peak reserved VRAM in GB
    time_required: float   # wall-clock time in seconds (includes video export)
    combination: str       # newline-joined names of the enabled options


def main(
    i: int,
    combination: Tuple[bool, bool, bool, bool, bool]
) -> ResultDict | None:
    """Generate one video with the given option combination and measure VRAM/time.

    combination flags, in order:
        [0] pipe.to("cuda")
        [1] pipe.enable_model_cpu_offload()
        [2] pipe.enable_sequential_cpu_offload()
        [3] pipe.vae.enable_slicing()
        [4] pipe.vae.enable_tiling()

    Returns:
        A ResultDict on success, or None when the combination is skipped
        (filtered out below) or generation raises an exception.
    """
    # Skip unwanted combinations: to("cuda") conflicts with CPU offloading,
    # and enable_model_cpu_offload() / vae.enable_tiling() are mandatory here.
    if combination[0]:
        return None
    if not combination[1]:
        return None
    if not combination[4]:
        return None

    pipe = LTXConditionPipeline.from_pretrained(
        "Lightricks/LTX-Video-0.9.7-distilled",
        torch_dtype=torch.bfloat16
    )
    try:
        # Record which options were enabled, for the final report.
        combination_list = []
        if combination[0]:
            pipe.to("cuda")
            # Fixed: label previously read 'to("cada")'.
            combination_list.append('to("cuda")')
        if combination[1]:
            pipe.enable_model_cpu_offload()
            combination_list.append("enable_model_cpu_offload()")
        if combination[2]:
            pipe.enable_sequential_cpu_offload()
            combination_list.append("enable_sequential_cpu_offload()")
        if combination[3]:
            pipe.vae.enable_slicing()
            combination_list.append("vae.enable_slicing()")
        if combination[4]:
            pipe.vae.enable_tiling()
            combination_list.append("vae.enable_tiling()")

        # NOTE: prompt text (including the "utlra" typo) is kept byte-for-byte
        # so the seeded generation output stays identical to the original run.
        prompt = (
            "artistic anatomical 3d render, utlra quality, human half full "
            "male body with transparent skin revealing structure instead of "
            "organs, muscular, intricate creative patterns, monochromatic "
            "with backlighting, lightning mesh, scientific concept art, "
            "blending biology with botany, surreal and ethereal quality, "
            "unreal engine 5, ray tracing, ultra realistic, 16K UHD, rich "
            "details. \ncamera zooms out in a rotating fashion"
        )
        negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
        height, width = 480, 832
        num_frames = 121

        start_time = time.time()
        video = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            num_frames=num_frames,
            guidance_scale=1.0,
            num_inference_steps=10,
            decode_timestep=0.05,
            decode_noise_scale=0.025,
            image_cond_noise_scale=0.0,
            guidance_rescale=0.7,
            generator=torch.Generator().manual_seed(42),
        ).frames[0]
        export_to_video(video, f"output{i}.mp4", fps=24)
        # Measured span deliberately includes export_to_video, matching the
        # timings reported in the article.
        end_time = time.time()

        result: ResultDict = {
            "memory": round(torch.cuda.max_memory_reserved() / 1024**3, 2),
            "time_required": round(end_time - start_time, 2),
            "combination": "\n".join(combination_list)
        }
    except Exception as e:
        # Best-effort benchmark: report the failing combination and continue
        # with the remaining ones instead of aborting the whole sweep.
        print("\n".join(combination_list))
        print(e)
        return None

    # Fixed: message previously read "succeee!!".
    print("success!!")
    print("\n".join(combination_list))
    print(f"saved video as output{i}.mp4")
    return result


if __name__ == "__main__":
    # All 2^5 on/off combinations of the five memory-saving options.
    combinations = list(product([True, False], repeat=5))
    result_list = []
    for i, combination in enumerate(combinations):
        reset_memory()
        result = main(i, combination)
        if result is not None:
            result_list.append(result)

    print("Sorted by time taken")
    for entry in sorted(result_list, key=lambda x: x["time_required"]):
        print(entry["combination"])
        print(f"time: {entry['time_required']} sec")
        print(f"memory: {entry['memory']} GB")
        print()

    print("Sorted by memory used")
    for entry in sorted(result_list, key=lambda x: x["memory"]):
        print(entry["combination"])
        print(f"memory: {entry['memory']} GB")
        print(f"time: {entry['time_required']} sec")
        print()