diff --git a/README.md b/README.md index a7d855df0..e448dfa37 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ It is recommended to install Modalities via uv or install PyTorch, psutil and Ni # Get uv (tested with uv version 0.9.13) curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync +uv sync --extra [cpu|cu126|cu128|cu130] # Choose exactly ONE of these extras; check your CUDA version via nvidia-smi source .venv/bin/activate # For developers: use [tests,linting] and install pre-commit hooks -uv sync --extra tests --extra linting +uv sync --extra [cpu|cu126|cu128|cu130] --extra tests --extra linting pre-commit install --install-hooks ``` @@ -60,7 +60,7 @@ conda create -n modalities python=3.13 conda activate modalities # Install PyTorch, psutil, Ninja and Flash Attention -pip install "torch<2.11.0" +pip install "torch<2.11.0" # Or appropriate version for your CUDA setup. pip install psutil ninja # Ninja lowers compilation time of flash attention significantly pip install flash-attn==2.8.3 --no-build-isolation ``` diff --git a/pyproject.toml b/pyproject.toml index b41fddd08..9ea55a4d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,6 @@ description = "Modalities, a PyTorch-native framework for distributed and reprod readme = "README.md" dependencies = [ "numpy", - "torch<2.11.0", "ninja", "packaging", "tqdm", @@ -25,7 +24,6 @@ dependencies = [ "matplotlib", "wandb", "einops>=0.7.0", - "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'", "debugpy", # For VSCode debugging support ] @@ -33,10 +31,6 @@ dependencies = [ Homepage = "https://github.com/Modalities/modalities" Issues = "https://github.com/Modalities/modalities/issues" -[project.optional-dependencies] -linting = ["pre-commit"] -tests = ["pytest", "pytest-cov", "debugpy"] - [project.scripts] modalities = "modalities.__main__:main" @@ -44,6 +38,72 @@ modalities = "modalities.__main__:main" requires = ["setuptools >= 61.0.0"] build-backend = "setuptools.build_meta" +[project.optional-dependencies] 
+linting = ["pre-commit"] +tests = ["pytest", "pytest-cov", "debugpy"] + +cpu = ["torch>=2.10,<2.11.0", "torchvision"] +cu126 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] +cu128 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] +cu130 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] + +[tool.uv] +conflicts = [ + [ + { extra = "cpu" }, + { extra = "cu126" }, + { extra = "cu128" }, + { extra = "cu130" }, + ], +] + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", extra = "cpu" }, + { index = "pytorch-cu126", extra = "cu126" }, + { index = "pytorch-cu128", extra = "cu128" }, + { index = "pytorch-cu130", extra = "cu130" }, +] +torchvision = [ + { index = "pytorch-cpu", extra = "cpu" }, + { index = "pytorch-cu126", extra = "cu126" }, + { index = "pytorch-cu128", extra = "cu128" }, + { index = "pytorch-cu130", extra = "cu130" }, +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu126" +url = "https://download.pytorch.org/whl/cu126" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" +explicit = true + + [tool.uv.extra-build-dependencies] flash-attn = [ { requirement = "torch", match-runtime = true }, diff --git a/src/modalities/utils/mfu.py b/src/modalities/utils/mfu.py index 76c63d122..201577702 100644 --- a/src/modalities/utils/mfu.py +++ b/src/modalities/utils/mfu.py @@ -14,7 +14,8 @@ # https://www.nvidia.com/en-us/data-center/h100/ # # NOTE: These values are valid for fp16 and bf16 only -PEAK_PERFORMANCE = {"A100": 312e12, "H100": 989e12, "GH200": 
989e12} +# NOTE: 2.25 PFLOPS is the B200 dense BF16/FP16 peak; the 4.5 PFLOPS figure includes 2:4 structured sparsity. +# The A100/H100/GH200 values above are dense figures as well, so the dense value is used for consistency +# (verify against NVIDIA's Blackwell datasheet if sparsity-aware MFU is ever wanted). +PEAK_PERFORMANCE = {"A100": 312e12, "H100": 989e12, "GH200": 989e12, "B200": 2.25e15} class MFUCalculatorABC: @@ -130,6 +131,10 @@ def _get_theoretical_gpu_peak_performance(model_parts: FSDPX | list[FSDP2], worl single_gpu_peak_performance = MFUCalculatorABC._get_theoretical_gpu_peak_performance_single( precision, "GH200" ) + elif device_name.startswith("NVIDIA B200"): + single_gpu_peak_performance = MFUCalculatorABC._get_theoretical_gpu_peak_performance_single( + precision, "B200" + ) else: warnings.warn(f"Could not get theoretical GPU peak performance for unknown device = {device_name}.") return None