Introduction
A Text2Image model called HiDream has been added to Diffusers, so I tried it out right away.
PC used
VRAM usage appears to be high, so I used an RTX 4090.
OS: Windows 11
Processor: Core(TM) i7-14700K
Installed RAM: 96.0 GB
GPU: RTX 4090 (VRAM 24GB)
Environment setup
pip install torch==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
pip install git+https://github.com/huggingface/diffusers
pip install transformers accelerate
pip install sentencepiece
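Note that the benchmark script below loads the tokenizer and text encoder from meta-llama/Meta-Llama-3.1-8B-Instruct, which is a gated repository on the Hugging Face Hub, so you will most likely need to accept the model license and authenticate before the download succeeds. A minimal sketch, assuming you already have an access token (the token string below is a placeholder):

# Authenticate with the Hugging Face Hub so the gated meta-llama checkpoint can be downloaded.
# The token value is a placeholder; running `huggingface-cli login` once works as well.
from huggingface_hub import login

login(token="hf_xxxxxxxxxxxxxxxx")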
Results

Python script
I ran generation under various conditions using the script below.

import torch
from transformers import PreTrainedTokenizerFast, LlamaForCausalLM
from diffusers import HiDreamImagePipeline, HiDreamImageTransformer2DModel
from typing import Optional, Tuple, TypedDict
from itertools import product
import gc
import time


def reset_memory():
    # Clear cached CUDA memory and reset the peak-memory statistics between runs.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_accumulated_memory_stats()
    torch.cuda.reset_peak_memory_stats()


class ResultDict(TypedDict):
    memory: float
    time_required: float
    combination: str


def main(
    i: int,
    combination: Tuple[bool, bool, bool, bool, bool]
) -> Optional[ResultDict]:
    # Run one benchmark: build the pipeline, apply the selected memory options,
    # and generate a single image while recording time and peak reserved VRAM.
    if not combination[1]:  # enable_model_cpu_offload() must be enabled
        return None
    if not combination[2]:  # enable_sequential_cpu_offload() must be enabled
        return None

    tokenizer_4 = PreTrainedTokenizerFast.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct"
    )
    text_encoder_4 = LlamaForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        output_hidden_states=True,
        output_attentions=True,
        torch_dtype=torch.bfloat16
    )
    transformer = HiDreamImageTransformer2DModel.from_pretrained(
        "HiDream-ai/HiDream-I1-Dev",
        subfolder="transformer",
        torch_dtype=torch.bfloat16
    )
    pipe = HiDreamImagePipeline.from_pretrained(
        "HiDream-ai/HiDream-I1-Dev",
        tokenizer_4=tokenizer_4,
        text_encoder_4=text_encoder_4,
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    )

    try:
        combination_list = []
        if combination[0]:
            pipe.to("cuda")
            combination_list.append('to("cuda")')
        if combination[1]:
            pipe.enable_model_cpu_offload()
            combination_list.append("enable_model_cpu_offload()")
        if combination[2]:
            pipe.enable_sequential_cpu_offload()
            combination_list.append("enable_sequential_cpu_offload()")
        if combination[3]:
            pipe.enable_vae_slicing()
            combination_list.append("enable_vae_slicing()")
        if combination[4]:
            pipe.enable_vae_tiling()
            combination_list.append("enable_vae_tiling()")

        start_time = time.time()
        image = pipe(
            'A cat holding a sign that says "Hi-Dreams.ai".',
            height=1024,
            width=1024,
            guidance_scale=5.0,
            num_inference_steps=28,
            generator=torch.Generator("cuda").manual_seed(0),
        ).images[0]
        image.save(f"output{i}.png")
        end_time = time.time()

        result: ResultDict = {
            "memory": round(torch.cuda.max_memory_reserved() / 1024**3, 2),
            "time_required": round(end_time - start_time, 2),
            "combination": "\n".join(combination_list)
        }
    except Exception as e:
        print("\n".join(combination_list))
        print(e)
        return None

    print("success!!")
    print("\n".join(combination_list))
    print(f"saved image as output{i}.png")
    return result


if __name__ == "__main__":
    # All 32 on/off combinations of the five options
    # (combinations without both offload calls are skipped inside main()).
    combinations = list(product([True, False], repeat=5))
    result_list = []
    for i, combination in enumerate(combinations):
        reset_memory()
        result = main(i, combination)
        if result is not None:
            result_list.append(result)

    print("Sorted by time taken")
    time_sorted_list = sorted(result_list, key=lambda x: x["time_required"])
    for time_sorted in time_sorted_list:
        print(time_sorted["combination"])
        print(f"time: {time_sorted['time_required']} sec")
        print(f"memory: {time_sorted['memory']} GB")
        print()

    print("Sorted by memory used")
    memory_sorted_list = sorted(result_list, key=lambda x: x["memory"])
    for memory_sorted in memory_sorted_list:
        print(memory_sorted["combination"])
        print(f"memory: {memory_sorted['memory']} GB")
        print(f"time: {memory_sorted['time_required']} sec")
        print()
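The peak-VRAM figures reported in the next subsection come from torch.cuda.max_memory_reserved(), reset between runs by reset_memory(). For reference, this is a minimal sketch of that measurement pattern in isolation; max_memory_reserved() reports the caching allocator's high-water mark, which can be larger than the memory actually occupied by tensors. The helper name and the dummy workload are only illustrative.

import torch

def measure_peak_vram(fn):
    """Run fn() and return the peak reserved VRAM in GiB (illustrative helper)."""
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    fn()
    return torch.cuda.max_memory_reserved() / 1024**3

# Hypothetical workload: allocate a ~1 GiB float32 tensor on the GPU.
peak = measure_peak_vram(lambda: torch.empty(256 * 1024 * 1024, device="cuda"))
print(f"peak reserved VRAM: {peak:.2f} GB")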
Benchmark results
Sorted by time taken

enable_model_cpu_offload()
enable_sequential_cpu_offload()
time: 120.92 sec
memory: 3.07 GB

enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
time: 121.53 sec
memory: 3.07 GB

enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_tiling()
time: 121.65 sec
memory: 3.07 GB

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_tiling()
time: 137.86 sec
memory: 58.26 GB

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
time: 139.09 sec
memory: 58.26 GB

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
time: 139.11 sec
memory: 58.26 GB

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
enable_vae_tiling()
time: 151.46 sec
memory: 58.26 GB

enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
enable_vae_tiling()
time: 168.88 sec
memory: 3.07 GB
Sorted by VRAM usage

enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
enable_vae_tiling()
memory: 3.07 GB
time: 168.88 sec

enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
memory: 3.07 GB
time: 121.53 sec

enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_tiling()
memory: 3.07 GB
time: 121.65 sec

enable_model_cpu_offload()
enable_sequential_cpu_offload()
memory: 3.07 GB
time: 120.92 sec

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
enable_vae_tiling()
memory: 58.26 GB
time: 151.46 sec

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_slicing()
memory: 58.26 GB
time: 139.11 sec

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
enable_vae_tiling()
memory: 58.26 GB
time: 137.86 sec

to("cuda")
enable_model_cpu_offload()
enable_sequential_cpu_offload()
memory: 58.26 GB
time: 139.09 sec
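According to the table above, the fastest run (about 121 seconds, roughly 3.07 GB reserved) used enable_model_cpu_offload() together with enable_sequential_cpu_offload() and nothing else, and it was also tied for the lowest VRAM usage. The following is a minimal sketch of a single standalone run with that configuration, reusing the same checkpoints and sampling parameters as the benchmark script; it is a distillation of the script above rather than a separately verified recommendation.

import torch
from transformers import PreTrainedTokenizerFast, LlamaForCausalLM
from diffusers import HiDreamImagePipeline, HiDreamImageTransformer2DModel

# Same checkpoints as the benchmark script.
tokenizer_4 = PreTrainedTokenizerFast.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct"
)
text_encoder_4 = LlamaForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    output_hidden_states=True,
    output_attentions=True,
    torch_dtype=torch.bfloat16,
)
transformer = HiDreamImageTransformer2DModel.from_pretrained(
    "HiDream-ai/HiDream-I1-Dev",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
pipe = HiDreamImagePipeline.from_pretrained(
    "HiDream-ai/HiDream-I1-Dev",
    tokenizer_4=tokenizer_4,
    text_encoder_4=text_encoder_4,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)

# Fastest / lowest-VRAM combination from the benchmark above.
pipe.enable_model_cpu_offload()
pipe.enable_sequential_cpu_offload()

image = pipe(
    'A cat holding a sign that says "Hi-Dreams.ai".',
    height=1024,
    width=1024,
    guidance_scale=5.0,
    num_inference_steps=28,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("output.png")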