From 8debc5e6306c36325b90c325ac72ae58c2b5ce07 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Wed, 6 May 2026 20:03:47 -0700 Subject: [PATCH 01/15] feat(inference): add ServeConfig and _EventLoopManager Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/__init__.py | 14 +++++ megatron/inference/_llm_base.py | 82 ++++++++++++++++++++++++++++++ megatron/inference/serve_config.py | 71 ++++++++++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 megatron/inference/_llm_base.py create mode 100644 megatron/inference/serve_config.py diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py index 26496bfed70..3795e21593e 100644 --- a/megatron/inference/__init__.py +++ b/megatron/inference/__init__.py @@ -1 +1,15 @@ # Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.inference.inference_request import ( + DynamicInferenceRequest, + DynamicInferenceRequestRecord, +) +from megatron.core.inference.sampling_params import SamplingParams +from megatron.inference.serve_config import ServeConfig + +__all__ = [ + "DynamicInferenceRequest", + "DynamicInferenceRequestRecord", + "SamplingParams", + "ServeConfig", +] diff --git a/megatron/inference/_llm_base.py b/megatron/inference/_llm_base.py new file mode 100644 index 00000000000..d41368236f7 --- /dev/null +++ b/megatron/inference/_llm_base.py @@ -0,0 +1,82 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +"""Internal building blocks for the Megatron inference high-level API. + +This module hosts private helpers shared by the future ``MegatronLLM`` and +``MegatronAsyncLLM`` classes. In Stage 1 only ``_EventLoopManager`` is +defined; the coordinator runtime and the base class are added in later +stages. +""" + +import asyncio +import concurrent.futures +import threading +import time +from typing import Coroutine + + +class _EventLoopManager: + """Per-instance background daemon thread + persistent asyncio event loop. + + Bridges sync and async user-thread callers to coroutines that run on the + background loop via ``asyncio.run_coroutine_threadsafe``. Mirrors the + pattern used by NeMo RL's inference worker. + """ + + def __init__(self) -> None: + self._loop: asyncio.AbstractEventLoop | None = None + self._thread: threading.Thread | None = None + self._started: bool = False + self._stopped: bool = False + + def start(self) -> None: + """Spawn the daemon thread and start the event loop. Idempotent.""" + if self._started: + return + + def _run_loop() -> None: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + self._loop = loop + loop.run_forever() + + self._thread = threading.Thread(target=_run_loop, daemon=True) + self._thread.start() + + # Wait for the loop to be created and running before returning so + # callers can use ``submit`` immediately. Mirrors NeMo RL's polling + # approach. + while self._loop is None or not self._loop.is_running(): + time.sleep(0.001) + + self._started = True + + def submit(self, coro: Coroutine) -> "concurrent.futures.Future": + """Schedule ``coro`` on the background loop and return its future. + + The caller decides how to wait on the returned future (e.g. + ``.result()`` for blocking sync, ``asyncio.wrap_future(...)`` for + awaiting from another loop). 
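+
+        For example (``mgr`` is a started manager; names are illustrative)::
+
+            fut = mgr.submit(some_coroutine())
+            result = fut.result()  # blocking wait from sync code
+            # ...or, from code running on a different event loop:
+            result = await asyncio.wrap_future(fut)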
+ """ + if not self._started or self._loop is None: + raise RuntimeError("_EventLoopManager.start() must be called before submit().") + return asyncio.run_coroutine_threadsafe(coro, self._loop) + + def run_sync(self, coro: Coroutine): + """Schedule ``coro`` on the background loop and block on its result.""" + return self.submit(coro).result() + + async def run_async(self, coro: Coroutine): + """Schedule ``coro`` on the background loop and await it from any loop.""" + return await asyncio.wrap_future(self.submit(coro)) + + def stop(self) -> None: + """Stop the event loop and join the background thread. Idempotent.""" + if not self._started or self._stopped: + return + assert self._loop is not None + assert self._thread is not None + self._loop.call_soon_threadsafe(self._loop.stop) + self._thread.join() + self._stopped = True + self._started = False diff --git a/megatron/inference/serve_config.py b/megatron/inference/serve_config.py new file mode 100644 index 00000000000..ec4c0cd5966 --- /dev/null +++ b/megatron/inference/serve_config.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +from dataclasses import dataclass, field +from typing import Literal + + +@dataclass +class ServeConfig: + """Programmatic configuration for ``MegatronAsyncLLM.serve(...)``. + + This dataclass also serves as the future source of truth for a + ``megatron serve`` CLI. It controls only the HTTP serving surface; engine + construction and coordinator addressing are configured separately via the + ``MegatronLLM`` / ``MegatronAsyncLLM`` constructor. + """ + + host: str = "0.0.0.0" + """HTTP bind host for the OpenAI-compatible frontend. + + Distinct from the ``MegatronLLM`` / ``MegatronAsyncLLM`` constructor's + ``coordinator_host`` argument: ``coordinator_host`` is the internal/routable + address used for coordinator ZMQ traffic, whereas ``host`` is the + externally-visible interface where the HTTP server accepts client + connections. + """ + + port: int = 5000 + """HTTP bind port for the OpenAI-compatible frontend.""" + + model_name: str = "megatron-model" + """Served OpenAI model name. + + Echoed in HTTP responses regardless of ``strict_model_name``. The + ``/v1/models`` endpoint always returns this value as the single advertised + model id. + """ + + strict_model_name: bool = True + """Whether to validate the request ``model`` field against ``model_name``. + + If True, requests whose ``model`` field does not match ``model_name`` are + rejected with HTTP 400 in OpenAI's error shape. If False, the request is + accepted regardless of the supplied ``model`` value. + """ + + role: Literal["primary", "worker", "auto"] = "auto" + """Per-rank role selector for the serving frontend. + + - ``"primary"``: this rank exposes the HTTP frontend. + - ``"worker"``: this rank does not expose HTTP; it participates in the + dynamic engine loop only. + - ``"auto"``: automatically picks ``"primary"`` on global rank 0 and + ``"worker"`` elsewhere. + """ + + parsers: list[str] = field(default_factory=list) + """Response parser names to enable on the HTTP frontend. + + Examples include ``["json", "tool_use"]``. Values are passed through to the + underlying text-generation server unchanged. + """ + + verbose: bool = False + """Whether the HTTP frontend should log per-request detail.""" + + frontend_replicas: int = 4 + """Number of HTTP frontend processes spawned on the primary rank. + + The default of 4 matches the existing ``start_text_gen_server`` default of + ``num_replicas=4``. 
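+
+    Example override (illustrative values)::
+
+        ServeConfig(port=8000, frontend_replicas=1)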
+ """ From 4ca3c951ee6ccfe31833c14a415639b4b9d04d58 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Wed, 6 May 2026 21:48:10 -0700 Subject: [PATCH 02/15] feat(inference): add coordinator runtime and _MegatronLLMBase Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/_llm_base.py | 443 +++++++++++++++++++++++++++++++- 1 file changed, 437 insertions(+), 6 deletions(-) diff --git a/megatron/inference/_llm_base.py b/megatron/inference/_llm_base.py index d41368236f7..755145acd71 100644 --- a/megatron/inference/_llm_base.py +++ b/megatron/inference/_llm_base.py @@ -3,24 +3,35 @@ """Internal building blocks for the Megatron inference high-level API. This module hosts private helpers shared by the future ``MegatronLLM`` and -``MegatronAsyncLLM`` classes. In Stage 1 only ``_EventLoopManager`` is -defined; the coordinator runtime and the base class are added in later -stages. +``MegatronAsyncLLM`` classes: ``_EventLoopManager`` (Stage 1), +``_CoordinatorRuntime`` and ``_MegatronLLMBase`` (Stage 2). The public +sync/async wrappers are added in subsequent stages. """ import asyncio import concurrent.futures import threading import time -from typing import Coroutine +from typing import Coroutine, List, Optional, Tuple, Union + +from megatron.core.inference.config import InferenceConfig +from megatron.core.inference.contexts.dynamic_context import DynamicInferenceContext +from megatron.core.inference.engines.dynamic_engine import DynamicInferenceEngine, EngineState +from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( + GPTInferenceWrapper, +) +from megatron.core.inference.sampling_params import SamplingParams +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, +) class _EventLoopManager: """Per-instance background daemon thread + persistent asyncio event loop. Bridges sync and async user-thread callers to coroutines that run on the - background loop via ``asyncio.run_coroutine_threadsafe``. Mirrors the - pattern used by NeMo RL's inference worker. + background loop via ``asyncio.run_coroutine_threadsafe``. """ def __init__(self) -> None: @@ -51,6 +62,13 @@ def _run_loop() -> None: self._started = True + @property + def loop(self) -> asyncio.AbstractEventLoop: + """The background asyncio loop. Raises if ``start()`` has not been called.""" + if not self._started or self._loop is None: + raise RuntimeError("_EventLoopManager.start() must be called before accessing loop.") + return self._loop + def submit(self, coro: Coroutine) -> "concurrent.futures.Future": """Schedule ``coro`` on the background loop and return its future. @@ -80,3 +98,416 @@ def stop(self) -> None: self._thread.join() self._stopped = True self._started = False + + +class _CoordinatorRuntime: + """Owns the dynamic-inference coordinator and ``InferenceClient`` lifecycle. + + Async-native: :meth:`setup` and :meth:`teardown` are coroutines meant to + run on a background loop owned by :class:`_EventLoopManager`. The primary + rank additionally holds an :class:`InferenceClient` used by the high-level + API to submit requests and send control signals. 
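+
+    Lifecycle sketch, mirroring how ``_MegatronLLMBase`` drives this class
+    (a sketch, not the exact call sequence)::
+
+        runtime = _CoordinatorRuntime(
+            engine, is_primary=True, coordinator_host=None, coordinator_port=None
+        )
+        loop_manager.run_sync(runtime.setup(loop=loop_manager.loop))
+        ...  # on the primary rank, requests flow through runtime.client
+        loop_manager.run_sync(runtime.teardown())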
+ """ + + def __init__( + self, + engine: "DynamicInferenceEngine", + *, + is_primary: bool, + coordinator_host: Optional[str], + coordinator_port: Optional[int], + ) -> None: + self._engine = engine + self._is_primary = is_primary + self._coordinator_host = coordinator_host + self._coordinator_port = coordinator_port + self._client: "InferenceClient | None" = None + self._coord_addr: Optional[str] = None + + async def setup(self, *, loop: asyncio.AbstractEventLoop) -> None: + """Bring the coordinator and (on primary) the ``InferenceClient`` up. + + Calls ``engine.start_listening_to_data_parallel_coordinator(loop=loop)`` + on every rank. Only host/port kwargs that the caller actually supplied + are forwarded so the engine can auto-bind when both are ``None``. + """ + kwargs = {"loop": loop} + if self._coordinator_host is not None: + kwargs["hostname"] = self._coordinator_host + if self._coordinator_port is not None: + kwargs["inference_coordinator_port"] = self._coordinator_port + + coord_addr = await self._engine.start_listening_to_data_parallel_coordinator(**kwargs) + self._coord_addr = coord_addr + + if self._is_primary: + # Lazy import: keep this module importable without pyzmq/msgpack + # installed when the user only needs direct mode. + from megatron.core.inference.inference_client import InferenceClient + + client = InferenceClient(coord_addr) + client.start(loop=loop) + self._client = client + + async def teardown(self) -> None: + """Primary-only client shutdown. + + Worker ranks are no-ops here; their ``engine_loop_task`` is awaited by + :meth:`_MegatronLLMBase._shutdown_impl` after the primary has issued + the STOP signal. + """ + if not self._is_primary: + return + assert self._client is not None + self._client.shutdown_coordinator() + self._client.stop() + + @property + def client(self) -> "InferenceClient | None": + """The :class:`InferenceClient` on the primary rank; ``None`` on workers.""" + return self._client + + @property + def coord_addr(self) -> Optional[str]: + """Address returned by ``start_listening_to_data_parallel_coordinator``.""" + return self._coord_addr + + +class _MegatronLLMBase: + """Shared base for ``MegatronLLM`` and ``MegatronAsyncLLM``. + + Public async methods (``generate``, ``pause``, ``unpause``, ``suspend``, + ``resume``, ``shutdown``, ``wait_for_shutdown``) are inherited as-is by + ``MegatronAsyncLLM`` and overridden with sync versions by ``MegatronLLM``. + The actual work runs inside private ``__impl`` coroutines that are + scheduled on the background runtime loop in coordinator mode. + + Two execution modes are supported: + + - **Direct mode** (``use_coordinator=False``): every rank is treated as + primary and ``generate`` runs the engine synchronously (offloaded to a + thread when called from an event loop). ``generate`` is single-caller in + direct mode -- concurrent calls (e.g. via ``asyncio.gather``) raise + :class:`RuntimeError`; pass a list of prompts instead. Lifecycle methods + raise :class:`RuntimeError`. + - **Coordinator mode** (``use_coordinator=True``): a background event loop + hosts the engine pipeline and an :class:`InferenceClient` (on global + rank 0). Only the primary rank may submit requests via ``generate``. + + ``model`` must be in eval mode before construction; this class does not + modify the model state. 
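+
+    Construction sketch (the subclasses are the public entry points; ``model``
+    and ``tokenizer`` are assumed already built)::
+
+        llm = MegatronLLM(model=model.eval(), tokenizer=tokenizer)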
+ """ + + def __init__( + self, + *, + model, + tokenizer, + inference_config: Optional[InferenceConfig] = None, + use_coordinator: bool = False, + coordinator_host: Optional[str] = None, + coordinator_port: Optional[int] = None, + ) -> None: + if (coordinator_host is not None or coordinator_port is not None) and not use_coordinator: + raise ValueError("coordinator_host/port require use_coordinator=True") + + if inference_config is None: + inference_config = InferenceConfig() + + # Build the engine pipeline. Mirrors examples/inference/gpt/gpt_dynamic_inference.py. + context = DynamicInferenceContext(model.config, inference_config) + # TODO: extend for non-GPT models in a future iteration. + wrapper = GPTInferenceWrapper(model, context) + controller = TextGenerationController(inference_wrapped_model=wrapper, tokenizer=tokenizer) + engine = DynamicInferenceEngine(controller=controller, context=context) + + if use_coordinator: + # Lazy import so the module imports cleanly without torch installed. + import torch.distributed as dist + + is_primary_rank = dist.get_rank() == 0 + else: + is_primary_rank = True + + self._engine = engine + self._context = context + self._controller = controller + self._use_coordinator = use_coordinator + self._is_primary_rank = is_primary_rank + self._loop_manager: "Optional[_EventLoopManager]" = None + self._coord_runtime: "Optional[_CoordinatorRuntime]" = None + self._shutdown_called: bool = False + self._direct_generate_in_flight: bool = False + + if use_coordinator: + loop_manager = _EventLoopManager() + loop_manager.start() + try: + coord_runtime = _CoordinatorRuntime( + engine, + is_primary=is_primary_rank, + coordinator_host=coordinator_host, + coordinator_port=coordinator_port, + ) + loop_manager.run_sync(coord_runtime.setup(loop=loop_manager.loop)) + except BaseException: + loop_manager.stop() + raise + self._loop_manager = loop_manager + self._coord_runtime = coord_runtime + + # ---- properties ---- + + @property + def is_primary_rank(self) -> bool: + """Whether ``generate`` may be called on this rank.""" + return self._is_primary_rank + + @property + def engine(self) -> "DynamicInferenceEngine": + """The underlying :class:`DynamicInferenceEngine`.""" + return self._engine + + @property + def context(self) -> "DynamicInferenceContext": + """The underlying :class:`DynamicInferenceContext`.""" + return self._context + + @property + def controller(self) -> "TextGenerationController": + """The underlying :class:`TextGenerationController`.""" + return self._controller + + # ---- internal helpers ---- + + def _assert_primary(self) -> None: + if not self._is_primary_rank: + raise RuntimeError( + "generate(...) is only valid on the primary rank in coordinator mode" + ) + + def _assert_coordinator(self) -> None: + if not self._use_coordinator: + raise RuntimeError("This method requires use_coordinator=True") + + def _normalize_prompts( + self, prompts: Union[str, List[int], List[str], List[List[int]]] + ) -> Tuple[Union[List[str], List[List[int]]], bool]: + """Return ``(normalized_list, is_batch_input)``. + + - ``"abc"`` -> ``(["abc"], False)`` + - ``[1, 2, 3]`` -> ``([[1, 2, 3]], False)`` (single token-id prompt) + - ``["abc", "def"]`` -> ``(["abc", "def"], True)`` + - ``[[1, 2], [3, 4]]`` -> ``([[1, 2], [3, 4]], True)`` + - ``[]`` -> ``([], True)`` + + Only the first element is inspected to distinguish single vs batch; + per-element type validation is left to the engine. 
+ """ + if isinstance(prompts, str): + return [prompts], False + if isinstance(prompts, list): + if not prompts: + return [], True + first = prompts[0] + if isinstance(first, int): + return [prompts], False + if isinstance(first, (str, list)): + return prompts, True + raise TypeError( + f"Unsupported prompt element type: {type(first)}; " + "expected str, list[int], list[str], or list[list[int]]." + ) + raise TypeError( + f"prompts must be str, list[int], list[str], or list[list[int]]; " + f"got {type(prompts)}" + ) + + # ---- public async methods (inherited by MegatronAsyncLLM; overridden in MegatronLLM) ---- + + async def generate( + self, + prompts: Union[str, List[int], List[str], List[List[int]]], + sampling_params: Optional[SamplingParams] = None, + ) -> Union["DynamicInferenceRequest", List["DynamicInferenceRequest"]]: + """Run inference for one prompt or a batch of prompts. + + Single input (``str`` or ``list[int]``) returns a single + ``DynamicInferenceRequest``; batched input (``list[str]`` or + ``list[list[int]]``) returns ``list[DynamicInferenceRequest]`` in + input order. + + In direct mode, ``generate`` is single-caller -- concurrent calls raise + ``RuntimeError``. Pass batched input instead of using + ``asyncio.gather``. + + Raises: + RuntimeError: if called on a non-primary rank in coordinator mode, + or if a second concurrent call enters in direct mode. + """ + self._assert_primary() + if sampling_params is None: + sampling_params = SamplingParams() + + normalized, is_batch = self._normalize_prompts(prompts) + + if not normalized: + # Empty batch: nothing to schedule. ``is_batch`` is always True + # here since single input is wrapped to a one-element list. + return [] + + if self._use_coordinator: + assert self._loop_manager is not None + results = await self._loop_manager.run_async( + self._generate_impl(normalized, sampling_params) + ) + else: + if self._direct_generate_in_flight: + raise RuntimeError( + "MegatronAsyncLLM.generate in direct mode is single-caller; " + "pass a list of prompts instead of using asyncio.gather." + ) + self._direct_generate_in_flight = True + try: + results = await self._generate_impl(normalized, sampling_params) + finally: + self._direct_generate_in_flight = False + + return results if is_batch else results[0] + + async def pause(self) -> None: + """Transition the engine to ``PAUSED``. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._pause_impl()) + + async def unpause(self) -> None: + """Transition the engine from ``PAUSED`` back to ``RUNNING``. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._unpause_impl()) + + async def suspend(self) -> None: + """Transition the engine to ``SUSPENDED`` (offloads GPU buffers). + + The caller must ``pause()`` first; this method does not enforce that. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._suspend_impl()) + + async def resume(self) -> None: + """Transition the engine from ``SUSPENDED`` to ``RESUMED``. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). 
+ """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._resume_impl()) + + async def shutdown(self) -> None: + """Stop the engine, tear down the coordinator, and join the runtime thread. + + Idempotent. No-op in direct mode. + """ + if self._shutdown_called: + return + self._shutdown_called = True + if not self._use_coordinator: + return + assert self._loop_manager is not None + await self._loop_manager.run_async(self._shutdown_impl()) + # Stop the loop in a worker thread so we don't block the caller's loop. + await asyncio.to_thread(self._loop_manager.stop) + + async def wait_for_shutdown(self) -> None: + """Block until the engine's background loop task terminates. + + No-op in direct mode. + """ + if not self._use_coordinator: + return + assert self._loop_manager is not None + await self._loop_manager.run_async(self._wait_for_shutdown_impl()) + + # ---- private impl coroutines (run on the runtime loop) ---- + # The coordinator requires a long running runtime event loop, so we define these methods + # to route the user's event loop to our runtime loop + + + async def _generate_impl( + self, + prompts: Union[List[str], List[List[int]]], + sp: SamplingParams, + ) -> List["DynamicInferenceRequest"]: + """Run inference for a non-empty list of prompts; returns input-ordered list. + + - Coordinator mode: must run on the runtime loop (via + ``_loop_manager.run_async``); enqueues requests through + ``client.add_request`` and gathers all futures. + - Direct mode: runs on the caller's event loop; offloads the synchronous + ``engine.generate`` to a thread. + """ + if self._use_coordinator: + # ``add_request`` calls ``asyncio.get_running_loop().create_future()`` + # so it must be invoked from a coroutine on the runtime loop. This + # coroutine runs on that same loop, so ``asyncio.gather`` over the + # returned futures is safe. + assert self._coord_runtime is not None and self._coord_runtime.client is not None + futures = [self._coord_runtime.client.add_request(p, sp) for p in prompts] + return list(await asyncio.gather(*futures)) + # Direct mode: ``engine.generate`` accepts ``list[str]`` or + # ``list[list[int]]``; both flow through ``engine.add_request`` which + # accepts ``Union[str, List[int], Tensor]`` despite the narrower declared + # type on ``engine.generate`` itself. TODO: widen that signature upstream. 
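+        # ``asyncio.to_thread`` keeps the caller's event loop responsive while
+        # the blocking ``engine.generate`` call runs; ``merge()`` collapses
+        # each returned record into the ``DynamicInferenceRequest`` shape this
+        # method promises.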
+ records = await asyncio.to_thread(self._engine.generate, prompts, sp) + return [r.merge() for r in records] + + async def _pause_impl(self) -> None: + if self._is_primary_rank: + assert self._coord_runtime is not None and self._coord_runtime.client is not None + self._coord_runtime.client.pause_engines() + await self._engine.wait_until(EngineState.PAUSED) + + async def _unpause_impl(self) -> None: + if self._is_primary_rank: + assert self._coord_runtime is not None and self._coord_runtime.client is not None + self._coord_runtime.client.unpause_engines() + await self._engine.wait_until(EngineState.RUNNING) + + async def _suspend_impl(self) -> None: + if self._is_primary_rank: + assert self._coord_runtime is not None and self._coord_runtime.client is not None + self._coord_runtime.client.suspend_engines() + await self._engine.wait_until(EngineState.SUSPENDED) + + async def _resume_impl(self) -> None: + if self._is_primary_rank: + assert self._coord_runtime is not None and self._coord_runtime.client is not None + self._coord_runtime.client.resume_engines() + await self._engine.wait_until(EngineState.RESUMED) + + async def _shutdown_impl(self) -> None: + if self._is_primary_rank: + assert self._coord_runtime is not None and self._coord_runtime.client is not None + self._coord_runtime.client.stop_engines() + await self._engine.wait_until(EngineState.STOPPED) + await self._coord_runtime.teardown() + else: + await self._engine.engine_loop_task + + async def _wait_for_shutdown_impl(self) -> None: + await self._engine.engine_loop_task + From d7e68f135affa78011b27343c7e9a1c084bb6449 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Wed, 6 May 2026 22:09:23 -0700 Subject: [PATCH 03/15] feat(inference): add MegatronAsyncLLM, slim base class Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/__init__.py | 2 + megatron/inference/_llm_base.py | 162 +++++------------------------ megatron/inference/async_llm.py | 179 ++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 138 deletions(-) create mode 100644 megatron/inference/async_llm.py diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py index 3795e21593e..2a230600573 100644 --- a/megatron/inference/__init__.py +++ b/megatron/inference/__init__.py @@ -5,11 +5,13 @@ DynamicInferenceRequestRecord, ) from megatron.core.inference.sampling_params import SamplingParams +from megatron.inference.async_llm import MegatronAsyncLLM from megatron.inference.serve_config import ServeConfig __all__ = [ "DynamicInferenceRequest", "DynamicInferenceRequestRecord", + "MegatronAsyncLLM", "SamplingParams", "ServeConfig", ] diff --git a/megatron/inference/_llm_base.py b/megatron/inference/_llm_base.py index 755145acd71..ecc56cb4329 100644 --- a/megatron/inference/_llm_base.py +++ b/megatron/inference/_llm_base.py @@ -2,10 +2,11 @@ """Internal building blocks for the Megatron inference high-level API. -This module hosts private helpers shared by the future ``MegatronLLM`` and -``MegatronAsyncLLM`` classes: ``_EventLoopManager`` (Stage 1), -``_CoordinatorRuntime`` and ``_MegatronLLMBase`` (Stage 2). The public -sync/async wrappers are added in subsequent stages. +This module hosts private helpers shared by ``MegatronLLM`` and +``MegatronAsyncLLM``: ``_EventLoopManager``, ``_CoordinatorRuntime``, and +``_MegatronLLMBase``. The public sync/async wrappers live on the subclasses; +this base only exposes shared engine state, runtime spawn, validation +helpers, and the private ``__impl`` coroutines. 
""" import asyncio @@ -174,22 +175,24 @@ def coord_addr(self) -> Optional[str]: class _MegatronLLMBase: - """Shared base for ``MegatronLLM`` and ``MegatronAsyncLLM``. + """Private base shared by ``MegatronLLM`` and ``MegatronAsyncLLM``. - Public async methods (``generate``, ``pause``, ``unpause``, ``suspend``, - ``resume``, ``shutdown``, ``wait_for_shutdown``) are inherited as-is by - ``MegatronAsyncLLM`` and overridden with sync versions by ``MegatronLLM``. - The actual work runs inside private ``__impl`` coroutines that are - scheduled on the background runtime loop in coordinator mode. + This base intentionally exposes no public ``generate`` / lifecycle + methods -- those live on the subclasses, which call into the private + ``__impl`` coroutines defined here. The base owns: + + - the engine pipeline (engine, context, controller), + - the per-instance background runtime (``_loop_manager``, + ``_coord_runtime``) when ``use_coordinator=True``, + - validation helpers (``_assert_primary``, ``_assert_coordinator``) and + the input shape helper (``_normalize_prompts``). Two execution modes are supported: - **Direct mode** (``use_coordinator=False``): every rank is treated as primary and ``generate`` runs the engine synchronously (offloaded to a - thread when called from an event loop). ``generate`` is single-caller in - direct mode -- concurrent calls (e.g. via ``asyncio.gather``) raise - :class:`RuntimeError`; pass a list of prompts instead. Lifecycle methods - raise :class:`RuntimeError`. + thread when called from an event loop). Lifecycle methods are invalid + and raise :class:`RuntimeError` via ``_assert_coordinator``. - **Coordinator mode** (``use_coordinator=True``): a background event loop hosts the engine pipeline and an :class:`InferenceClient` (on global rank 0). Only the primary rank may submit requests via ``generate``. @@ -237,7 +240,6 @@ def __init__( self._loop_manager: "Optional[_EventLoopManager]" = None self._coord_runtime: "Optional[_CoordinatorRuntime]" = None self._shutdown_called: bool = False - self._direct_generate_in_flight: bool = False if use_coordinator: loop_manager = _EventLoopManager() @@ -323,129 +325,13 @@ def _normalize_prompts( f"got {type(prompts)}" ) - # ---- public async methods (inherited by MegatronAsyncLLM; overridden in MegatronLLM) ---- - - async def generate( - self, - prompts: Union[str, List[int], List[str], List[List[int]]], - sampling_params: Optional[SamplingParams] = None, - ) -> Union["DynamicInferenceRequest", List["DynamicInferenceRequest"]]: - """Run inference for one prompt or a batch of prompts. - - Single input (``str`` or ``list[int]``) returns a single - ``DynamicInferenceRequest``; batched input (``list[str]`` or - ``list[list[int]]``) returns ``list[DynamicInferenceRequest]`` in - input order. - - In direct mode, ``generate`` is single-caller -- concurrent calls raise - ``RuntimeError``. Pass batched input instead of using - ``asyncio.gather``. - - Raises: - RuntimeError: if called on a non-primary rank in coordinator mode, - or if a second concurrent call enters in direct mode. - """ - self._assert_primary() - if sampling_params is None: - sampling_params = SamplingParams() - - normalized, is_batch = self._normalize_prompts(prompts) - - if not normalized: - # Empty batch: nothing to schedule. ``is_batch`` is always True - # here since single input is wrapped to a one-element list. 
- return [] - - if self._use_coordinator: - assert self._loop_manager is not None - results = await self._loop_manager.run_async( - self._generate_impl(normalized, sampling_params) - ) - else: - if self._direct_generate_in_flight: - raise RuntimeError( - "MegatronAsyncLLM.generate in direct mode is single-caller; " - "pass a list of prompts instead of using asyncio.gather." - ) - self._direct_generate_in_flight = True - try: - results = await self._generate_impl(normalized, sampling_params) - finally: - self._direct_generate_in_flight = False - - return results if is_batch else results[0] - - async def pause(self) -> None: - """Transition the engine to ``PAUSED``. - - Raises: - RuntimeError: in direct mode (``use_coordinator=False``). - """ - self._assert_coordinator() - assert self._loop_manager is not None - await self._loop_manager.run_async(self._pause_impl()) - - async def unpause(self) -> None: - """Transition the engine from ``PAUSED`` back to ``RUNNING``. - - Raises: - RuntimeError: in direct mode (``use_coordinator=False``). - """ - self._assert_coordinator() - assert self._loop_manager is not None - await self._loop_manager.run_async(self._unpause_impl()) - - async def suspend(self) -> None: - """Transition the engine to ``SUSPENDED`` (offloads GPU buffers). - - The caller must ``pause()`` first; this method does not enforce that. - - Raises: - RuntimeError: in direct mode (``use_coordinator=False``). - """ - self._assert_coordinator() - assert self._loop_manager is not None - await self._loop_manager.run_async(self._suspend_impl()) - - async def resume(self) -> None: - """Transition the engine from ``SUSPENDED`` to ``RESUMED``. - - Raises: - RuntimeError: in direct mode (``use_coordinator=False``). - """ - self._assert_coordinator() - assert self._loop_manager is not None - await self._loop_manager.run_async(self._resume_impl()) - - async def shutdown(self) -> None: - """Stop the engine, tear down the coordinator, and join the runtime thread. - - Idempotent. No-op in direct mode. - """ - if self._shutdown_called: - return - self._shutdown_called = True - if not self._use_coordinator: - return - assert self._loop_manager is not None - await self._loop_manager.run_async(self._shutdown_impl()) - # Stop the loop in a worker thread so we don't block the caller's loop. - await asyncio.to_thread(self._loop_manager.stop) - - async def wait_for_shutdown(self) -> None: - """Block until the engine's background loop task terminates. - - No-op in direct mode. - """ - if not self._use_coordinator: - return - assert self._loop_manager is not None - await self._loop_manager.run_async(self._wait_for_shutdown_impl()) - - # ---- private impl coroutines (run on the runtime loop) ---- - # The coordinator requires a long running runtime event loop, so we define these methods - # to route the user's event loop to our runtime loop - + # ---- private impl coroutines ---- + # Subclasses' public methods bridge to these via ``_EventLoopManager`` + # (coordinator mode, on the runtime loop) or await them directly + # (direct mode, on the caller's event loop). 
+ # We need this bridge in coordinator mode because the coordinator requires + # a long running event loop, so we need to route the user's event + # loop to our runtime loop async def _generate_impl( self, diff --git a/megatron/inference/async_llm.py b/megatron/inference/async_llm.py new file mode 100644 index 00000000000..a7c8a8e2951 --- /dev/null +++ b/megatron/inference/async_llm.py @@ -0,0 +1,179 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +"""Async high-level inference API for Megatron (``MegatronAsyncLLM``).""" + +import asyncio +from typing import List, Optional, Union + +from megatron.core.inference.config import InferenceConfig +from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.core.inference.sampling_params import SamplingParams +from megatron.inference._llm_base import _MegatronLLMBase + + +class MegatronAsyncLLM(_MegatronLLMBase): + """Async high-level inference API for Megatron. + + Asyncio-native wrapper over the shared engine + runtime managed by + :class:`_MegatronLLMBase` -- see that class for execution modes + (direct vs coordinator), caller responsibilities, and the + ``model.eval()`` contract. + + On top of the base, this class provides: + + - ``async generate`` accepting single or batched prompts. In direct mode + it is single-caller -- concurrent calls (e.g. via ``asyncio.gather``) + raise :class:`RuntimeError`; pass a list of prompts to batch. + - ``async`` lifecycle controls: ``pause`` / ``unpause`` / ``suspend`` / + ``resume`` / ``shutdown`` / ``wait_for_shutdown``. + - ``async with`` context-manager protocol; exit calls :meth:`shutdown`. + + Note: + ``serve()`` (online HTTP serving) is not yet implemented and will be + added in a later stage. + """ + + def __init__( + self, + *, + model, + tokenizer, + inference_config: Optional[InferenceConfig] = None, + use_coordinator: bool = False, + coordinator_host: Optional[str] = None, + coordinator_port: Optional[int] = None, + ) -> None: + super().__init__( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, + use_coordinator=use_coordinator, + coordinator_host=coordinator_host, + coordinator_port=coordinator_port, + ) + # Concurrency guard for direct-mode generate (asyncio is single-threaded; + # a plain bool is sufficient). + self._direct_generate_in_flight: bool = False + + async def generate( + self, + prompts: Union[str, List[int], List[str], List[List[int]]], + sampling_params: Optional[SamplingParams] = None, + ) -> Union["DynamicInferenceRequest", List["DynamicInferenceRequest"]]: + """Run inference for one prompt or a batch of prompts. + + Single input (``str`` or ``list[int]``) returns a single + ``DynamicInferenceRequest``; batched input (``list[str]`` or + ``list[list[int]]``) returns ``list[DynamicInferenceRequest]`` in + input order. + + In direct mode, ``generate`` is single-caller -- concurrent calls raise + ``RuntimeError``. Pass batched input instead of using + ``asyncio.gather``. + + Raises: + RuntimeError: if called on a non-primary rank in coordinator mode, + or if a second concurrent call enters in direct mode. + """ + self._assert_primary() + if sampling_params is None: + sampling_params = SamplingParams() + + normalized, is_batch = self._normalize_prompts(prompts) + + if not normalized: + # Empty batch: nothing to schedule. ``is_batch`` is always True + # here since single input is wrapped to a one-element list. 
+ return [] + + if self._use_coordinator: + assert self._loop_manager is not None + results = await self._loop_manager.run_async( + self._generate_impl(normalized, sampling_params) + ) + else: + if self._direct_generate_in_flight: + raise RuntimeError( + "MegatronAsyncLLM.generate in direct mode is single-caller; " + "pass a list of prompts instead of using asyncio.gather." + ) + self._direct_generate_in_flight = True + try: + results = await self._generate_impl(normalized, sampling_params) + finally: + self._direct_generate_in_flight = False + + return results if is_batch else results[0] + + async def pause(self) -> None: + """Transition the engine to ``PAUSED``. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._pause_impl()) + + async def unpause(self) -> None: + """Transition the engine from ``PAUSED`` back to ``RUNNING``. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._unpause_impl()) + + async def suspend(self) -> None: + """Transition the engine to ``SUSPENDED`` (offloads GPU buffers). + + The caller must ``pause()`` first; this method does not enforce that. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._suspend_impl()) + + async def resume(self) -> None: + """Transition the engine from ``SUSPENDED`` to ``RESUMED``. + + Raises: + RuntimeError: in direct mode (``use_coordinator=False``). + """ + self._assert_coordinator() + assert self._loop_manager is not None + await self._loop_manager.run_async(self._resume_impl()) + + async def shutdown(self) -> None: + """Stop the engine, tear down the coordinator, and join the runtime thread. + + Idempotent. No-op in direct mode. + """ + if self._shutdown_called: + return + self._shutdown_called = True + if not self._use_coordinator: + return + assert self._loop_manager is not None + await self._loop_manager.run_async(self._shutdown_impl()) + # Stop the loop in a worker thread so we don't block the caller's loop. + await asyncio.to_thread(self._loop_manager.stop) + + async def wait_for_shutdown(self) -> None: + """Block until the engine's background loop task terminates. + + No-op in direct mode. 
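+
+        A typical worker-rank pattern in coordinator mode (sketch)::
+
+            if not llm.is_primary_rank:
+                await llm.wait_for_shutdown()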
+ """ + if not self._use_coordinator: + return + assert self._loop_manager is not None + await self._loop_manager.run_async(self._wait_for_shutdown_impl()) + + async def __aenter__(self) -> "MegatronAsyncLLM": + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: + await self.shutdown() From c03ab483e69e6b7770af83263c4c7cf31d00a8fc Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Wed, 6 May 2026 22:20:52 -0700 Subject: [PATCH 04/15] feat(inference): add MegatronLLM Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/__init__.py | 2 + megatron/inference/llm.py | 153 +++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 megatron/inference/llm.py diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py index 2a230600573..a4f6ea2ae75 100644 --- a/megatron/inference/__init__.py +++ b/megatron/inference/__init__.py @@ -6,12 +6,14 @@ ) from megatron.core.inference.sampling_params import SamplingParams from megatron.inference.async_llm import MegatronAsyncLLM +from megatron.inference.llm import MegatronLLM from megatron.inference.serve_config import ServeConfig __all__ = [ "DynamicInferenceRequest", "DynamicInferenceRequestRecord", "MegatronAsyncLLM", + "MegatronLLM", "SamplingParams", "ServeConfig", ] diff --git a/megatron/inference/llm.py b/megatron/inference/llm.py new file mode 100644 index 00000000000..9a2159fdec8 --- /dev/null +++ b/megatron/inference/llm.py @@ -0,0 +1,153 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +"""Sync high-level inference API for Megatron (``MegatronLLM``).""" + +from typing import List, Optional, Union + +from megatron.core.inference.config import InferenceConfig +from megatron.core.inference.inference_request import DynamicInferenceRequest +from megatron.core.inference.sampling_params import SamplingParams +from megatron.inference._llm_base import _MegatronLLMBase + + +class MegatronLLM(_MegatronLLMBase): + """Sync high-level inference API for Megatron. + + See :class:`_MegatronLLMBase` for execution modes (direct vs + coordinator), caller responsibilities, and the ``model.eval()`` contract. + + On top of the base, this class provides: + + - :meth:`generate` accepting one prompt or a batch; **always returns a + ``list[DynamicInferenceRequest]``** (single-prompt input returns a + one-element list -- deliberate asymmetry vs the async API). + - Sync lifecycle controls: :meth:`pause` / :meth:`unpause` / + :meth:`suspend` / :meth:`resume` / :meth:`shutdown` / + :meth:`wait_for_shutdown`. + - Context-manager protocol: ``with MegatronLLM(...) as llm:``; exit + calls :meth:`shutdown`. + + Note: + ``serve()`` (online HTTP serving) is async-only by design; use + :class:`MegatronAsyncLLM` for serving. + """ + + def __init__( + self, + *, + model, + tokenizer, + inference_config: Optional[InferenceConfig] = None, + use_coordinator: bool = False, + coordinator_host: Optional[str] = None, + coordinator_port: Optional[int] = None, + ) -> None: + super().__init__( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, + use_coordinator=use_coordinator, + coordinator_host=coordinator_host, + coordinator_port=coordinator_port, + ) + + def generate( + self, + prompts: Union[str, List[int], List[str], List[List[int]]], + sampling_params: Optional[SamplingParams] = None, + ) -> List["DynamicInferenceRequest"]: + """Run inference for one prompt or a batch. + + Returns ``list[DynamicInferenceRequest]`` in input order. 
Single-prompt
+        input returns a one-element list -- the always-list shape is a
+        deliberate asymmetry with the async API.
+
+        No concurrency guard is provided: the GIL does not serialize
+        concurrent calls from multiple threads, so if you share an instance
+        across threads you must serialize calls to ``generate`` externally.
+
+        Raises:
+            RuntimeError: if called on a non-primary rank in coordinator mode.
+        """
+        self._assert_primary()
+        if sampling_params is None:
+            sampling_params = SamplingParams()
+
+        normalized, _is_batch = self._normalize_prompts(prompts)
+        if not normalized:
+            return []
+
+        if self._use_coordinator:
+            assert self._loop_manager is not None
+            return self._loop_manager.run_sync(self._generate_impl(normalized, sampling_params))
+        # Direct mode: bypass _generate_impl (which would use to_thread,
+        # pointless for sync). Call the engine directly and merge.
+        records = self._engine.generate(normalized, sampling_params)
+        return [r.merge() for r in records]
+
+    def pause(self) -> None:
+        """Transition the engine to ``PAUSED``. Coordinator mode only.
+
+        Raises:
+            RuntimeError: in direct mode (``use_coordinator=False``).
+        """
+        self._assert_coordinator()
+        assert self._loop_manager is not None
+        self._loop_manager.run_sync(self._pause_impl())
+
+    def unpause(self) -> None:
+        """Transition the engine from ``PAUSED`` back to ``RUNNING``.
+
+        Raises:
+            RuntimeError: in direct mode (``use_coordinator=False``).
+        """
+        self._assert_coordinator()
+        assert self._loop_manager is not None
+        self._loop_manager.run_sync(self._unpause_impl())
+
+    def suspend(self) -> None:
+        """Transition the engine to ``SUSPENDED`` (offloads GPU buffers).
+
+        The caller must ``pause()`` first; this method does not enforce that.
+
+        Raises:
+            RuntimeError: in direct mode (``use_coordinator=False``).
+        """
+        self._assert_coordinator()
+        assert self._loop_manager is not None
+        self._loop_manager.run_sync(self._suspend_impl())
+
+    def resume(self) -> None:
+        """Transition the engine from ``SUSPENDED`` to ``RESUMED``.
+
+        Raises:
+            RuntimeError: in direct mode (``use_coordinator=False``).
+        """
+        self._assert_coordinator()
+        assert self._loop_manager is not None
+        self._loop_manager.run_sync(self._resume_impl())
+
+    def shutdown(self) -> None:
+        """Tear down the engine and runtime. Idempotent. Direct mode is a no-op."""
+        if self._shutdown_called:
+            return
+        self._shutdown_called = True
+        if not self._use_coordinator:
+            return  # direct mode: nothing to tear down
+        assert self._loop_manager is not None
+        self._loop_manager.run_sync(self._shutdown_impl())
+        # Sync caller already on its own thread; no need for to_thread.
+        self._loop_manager.stop()
+
+    def wait_for_shutdown(self) -> None:
+        """Block until the engine loop terminates.
Direct mode no-op.""" + if not self._use_coordinator: + return + assert self._loop_manager is not None + self._loop_manager.run_sync(self._wait_for_shutdown_impl()) + + def __enter__(self) -> "MegatronLLM": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.shutdown() From 5c38044a86141d2d5bec4269955da0e2d93acc7c Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Wed, 6 May 2026 22:43:02 -0700 Subject: [PATCH 05/15] refactor(inference): drop model_name fields from ServeConfig Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/serve_config.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/megatron/inference/serve_config.py b/megatron/inference/serve_config.py index ec4c0cd5966..ff6a4ad209a 100644 --- a/megatron/inference/serve_config.py +++ b/megatron/inference/serve_config.py @@ -27,22 +27,6 @@ class ServeConfig: port: int = 5000 """HTTP bind port for the OpenAI-compatible frontend.""" - model_name: str = "megatron-model" - """Served OpenAI model name. - - Echoed in HTTP responses regardless of ``strict_model_name``. The - ``/v1/models`` endpoint always returns this value as the single advertised - model id. - """ - - strict_model_name: bool = True - """Whether to validate the request ``model`` field against ``model_name``. - - If True, requests whose ``model`` field does not match ``model_name`` are - rejected with HTTP 400 in OpenAI's error shape. If False, the request is - accepted regardless of the supplied ``model`` value. - """ - role: Literal["primary", "worker", "auto"] = "auto" """Per-rank role selector for the serving frontend. From 6331cc04e6e3cdaea0dbb8d17bb17a40c5023faf Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Wed, 6 May 2026 23:19:52 -0700 Subject: [PATCH 06/15] feat(inference): add MegatronAsyncLLM.serve(), drop ServeConfig.role Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/async_llm.py | 81 +++++++++++++++++++++++++++--- megatron/inference/serve_config.py | 11 ---- 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/megatron/inference/async_llm.py b/megatron/inference/async_llm.py index a7c8a8e2951..edc3b1407af 100644 --- a/megatron/inference/async_llm.py +++ b/megatron/inference/async_llm.py @@ -2,13 +2,13 @@ """Async high-level inference API for Megatron (``MegatronAsyncLLM``).""" -import asyncio from typing import List, Optional, Union from megatron.core.inference.config import InferenceConfig from megatron.core.inference.inference_request import DynamicInferenceRequest from megatron.core.inference.sampling_params import SamplingParams from megatron.inference._llm_base import _MegatronLLMBase +from megatron.inference.serve_config import ServeConfig class MegatronAsyncLLM(_MegatronLLMBase): @@ -26,11 +26,8 @@ class MegatronAsyncLLM(_MegatronLLMBase): raise :class:`RuntimeError`; pass a list of prompts to batch. - ``async`` lifecycle controls: ``pause`` / ``unpause`` / ``suspend`` / ``resume`` / ``shutdown`` / ``wait_for_shutdown``. + - :meth:`serve` for OpenAI-compatible HTTP serving on the primary rank. - ``async with`` context-manager protocol; exit calls :meth:`shutdown`. - - Note: - ``serve()`` (online HTTP serving) is not yet implemented and will be - added in a later stage. """ def __init__( @@ -54,6 +51,8 @@ def __init__( # Concurrency guard for direct-mode generate (asyncio is single-threaded; # a plain bool is sufficient). self._direct_generate_in_flight: bool = False + # Set in serve() when this rank starts the HTTP frontend; consulted by shutdown(). 
+ self._serve_started: bool = False async def generate( self, @@ -155,12 +154,80 @@ async def shutdown(self) -> None: if self._shutdown_called: return self._shutdown_called = True + + # If we started an HTTP frontend, stop it first so no new requests + # arrive while we tear down the coordinator. Invariant: + # ``_serve_started`` can only be True when ``use_coordinator=True`` + # because ``serve()`` raises otherwise. + if self._serve_started: + from megatron.core.inference.text_generation_server.dynamic_text_gen_server.text_generation_server import ( + stop_text_gen_server, + ) + + stop_text_gen_server() + self._serve_started = False + if not self._use_coordinator: return assert self._loop_manager is not None await self._loop_manager.run_async(self._shutdown_impl()) - # Stop the loop in a worker thread so we don't block the caller's loop. - await asyncio.to_thread(self._loop_manager.stop) + self._loop_manager.stop() + + async def serve( + self, + serve_config: ServeConfig, + *, + blocking: bool = True, + ) -> None: + """Start the OpenAI-compatible HTTP frontend. + + Coordinator mode only. The HTTP frontend runs only on the primary + rank (global rank 0); other ranks no-op the HTTP setup but still + respect ``blocking`` (so all ranks return together). + + With ``blocking=True`` (default), this awaits the engine loop until + :meth:`shutdown` is called -- suitable for standalone serving scripts. + With ``blocking=False``, this returns once the HTTP frontend is up + (primary) or immediately (workers); the engine loop continues in the + background runtime, and the user can call :meth:`generate` / + :meth:`shutdown` afterward. + + Raises: + ValueError: if ``use_coordinator=False`` (HTTP serving requires + the coordinator path). + """ + if not self._use_coordinator: + raise ValueError( + "MegatronAsyncLLM.serve() requires use_coordinator=True" + ) + + if self._is_primary_rank: + # Lazy import: keep the module importable in environments where + # the HTTP server backend (Quart/Hypercorn) isn't installed. + import torch.distributed as dist + + from megatron.core.inference.text_generation_server.dynamic_text_gen_server.text_generation_server import ( + start_text_gen_server, + ) + + assert self._coord_runtime is not None + start_text_gen_server( + coordinator_addr=self._coord_runtime.coord_addr, + tokenizer=self._controller.tokenizer, + rank=dist.get_rank(), + server_port=serve_config.port, + parsers=serve_config.parsers, + verbose=serve_config.verbose, + num_replicas=serve_config.frontend_replicas, + hostname=serve_config.host, + ) + self._serve_started = True + + if blocking: + # Block until the engine loop terminates (shutdown was invoked + # somewhere in this process; for serve(blocking=True) typically by + # SIGINT or out-of-band orchestration). + await self.wait_for_shutdown() async def wait_for_shutdown(self) -> None: """Block until the engine's background loop task terminates. diff --git a/megatron/inference/serve_config.py b/megatron/inference/serve_config.py index ff6a4ad209a..aa7c6afe8fd 100644 --- a/megatron/inference/serve_config.py +++ b/megatron/inference/serve_config.py @@ -1,7 +1,6 @@ # Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. from dataclasses import dataclass, field -from typing import Literal @dataclass @@ -27,16 +26,6 @@ class ServeConfig: port: int = 5000 """HTTP bind port for the OpenAI-compatible frontend.""" - role: Literal["primary", "worker", "auto"] = "auto" - """Per-rank role selector for the serving frontend. 
-
-    - ``"primary"``: this rank exposes the HTTP frontend.
-    - ``"worker"``: this rank does not expose HTTP; it participates in the
-      dynamic engine loop only.
-    - ``"auto"``: automatically picks ``"primary"`` on global rank 0 and
-      ``"worker"`` elsewhere.
-    """
-

From b6448e83e59c8e701c03f7454d00e06d5ff3881c Mon Sep 17 00:00:00 2001
From: YangFei1990
Date: Thu, 7 May 2026 16:25:39 -0700
Subject: [PATCH 07/15] feat(inference): add offline_inference example, fix
 high-level API bugs

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 examples/inference/README.md                  |  17 +-
 .../gpt/gpt_dynamic_inference_12b.sh          | 127 --------
 .../gpt/gpt_dynamic_inference_357m.sh         | 115 --------
 .../{gpt => legacy}/gpt_dynamic_inference.py  |   0
 .../gpt_dynamic_inference_with_coordinator.py |   0
 .../{gpt => legacy}/gpt_static_inference.py   |   0
 .../simple_t5_batch_inference.py              |   0
 .../llama_mistral/huggingface_reference.py    |  25 --
 .../run_static_inference_llama4_scout.sh      |  68 -----
 .../run_text_generation_llama3.1.sh           |  56 ----
 .../run_text_generation_llama3.sh             |  55 ----
 .../run_text_generation_mistral.sh            |  53 ----
 examples/inference/offline_inference.py       | 278 ++++++++++++++++++
 examples/inference/run_offline_inference.sh   | 106 +++++++
 .../run_text_generation_server_345M.sh        |  31 --
 ...eneration_server_345M_8_tensor_parallel.sh |  29 --
 examples/inference/{gpt => }/utils.py         | 108 +++++++
 megatron/inference/_llm_base.py               |  18 +-
 18 files changed, 520 insertions(+), 566 deletions(-)
 delete mode 100644 examples/inference/gpt/gpt_dynamic_inference_12b.sh
 delete mode 100644 examples/inference/gpt/gpt_dynamic_inference_357m.sh
 rename examples/inference/{gpt => legacy}/gpt_dynamic_inference.py (100%)
 rename examples/inference/{gpt => legacy}/gpt_dynamic_inference_with_coordinator.py (100%)
 rename examples/inference/{gpt => legacy}/gpt_static_inference.py (100%)
 rename examples/inference/{t5 => legacy}/simple_t5_batch_inference.py (100%)
 delete mode 100644 examples/inference/llama_mistral/huggingface_reference.py
 delete mode 100755 examples/inference/llama_mistral/run_static_inference_llama4_scout.sh
 delete mode 100755 examples/inference/llama_mistral/run_text_generation_llama3.1.sh
 delete mode 100755 examples/inference/llama_mistral/run_text_generation_llama3.sh
 delete mode 100755 examples/inference/llama_mistral/run_text_generation_mistral.sh
 create mode 100644 examples/inference/offline_inference.py
 create mode 100644 examples/inference/run_offline_inference.sh
 delete mode 100755 examples/inference/run_text_generation_server_345M.sh
 delete mode 100755 examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
 rename examples/inference/{gpt => }/utils.py (70%)

diff --git a/examples/inference/README.md b/examples/inference/README.md
index 3259bf7f943..290b07440ab 100644
--- a/examples/inference/README.md
+++ b/examples/inference/README.md
@@ -1,5 +1,20 @@
 ### Megatron Core Inference Documentation
-This guide provides an example for Megatron Core for running model inference.
+This guide provides an example of running model inference with Megatron Core.
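+
+A minimal offline sketch of the high-level API added in this series (building
+`model` and `tokenizer` is elided here; `offline_inference.py` shows the full
+flow):
+
+```python
+from megatron.inference import MegatronLLM, SamplingParams
+
+# model is assumed built, loaded, and already in eval mode.
+with MegatronLLM(model=model, tokenizer=tokenizer) as llm:
+    # The sync API always returns a list of DynamicInferenceRequest results.
+    results = llm.generate("Hello, world!", SamplingParams())
+    print(results[0])
+```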
+ +### Folder structure +``` +examples/inference + legacy/ + gpt_dynamic_inference_with_coordinator.py + gpt_dynamic_inference.py + gpt_static_inference.py + simple_t5_batch_inference.py + offline_inference.py + launch_inference_server.py + utils.py + run_offline_inference.sh + run_inference_server.sh +``` ### Contents - [Megatron Core Inference Documentation](#megatron-core-inference-documentation) diff --git a/examples/inference/gpt/gpt_dynamic_inference_12b.sh b/examples/inference/gpt/gpt_dynamic_inference_12b.sh deleted file mode 100644 index ca21bb170a5..00000000000 --- a/examples/inference/gpt/gpt_dynamic_inference_12b.sh +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -# Run dynamic batching inference on the 12B GPT model. - -set -u - -# Libraries. -pip install simpy -pip install sentencepiece -pip install tiktoken - -# Environment variables. -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -# Checkpoint. -: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"} -: ${TOKENIZER_MODEL:?"TOKENIZER_MODEL is not set"} - -# Prompts. -: ${NUM_TOKENS_TO_PROMPT="8 32"} -: ${NUM_TOKENS_TO_GENERATE=256} -: ${INCOMING_REQUESTS_DURATION=10.} -: ${INCOMING_REQUESTS_PER_SEC=100.} - -# Dynamic context. -: ${BUFFER_SIZE_GB=50.} - -# Cuda graphs. -: ${NUM_CUDA_GRAPHS=16} - -# Miscellaneous. -: ${USE_COORDINATOR=0} -: ${ENGINE=dynamic} -: ${EXTRA_ARGS=""} -# NSIGHT_PREFIX=/path/to/nsight/profile - -# Arguments. -ARGS=" \ - --no-persist-layer-norm \ - --apply-layernorm-1p \ - --no-position-embedding \ - --group-query-attention \ - --num-query-groups 8 \ - --load ${CHECKPOINT_DIR} \ - --use-checkpoint-args \ - --untie-embeddings-and-output-weights \ - --disable-bias-linear \ - --use-rotary-position-embeddings \ - --position-embedding-type rope \ - --rotary-base 1000000 \ - --rotary-percent 1.0 \ - --swiglu \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --exit-duration-in-mins 5740 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 40 \ - --hidden-size 5120 \ - --ffn-hidden-size 14336 \ - --num-attention-heads 32 \ - --kv-channels 128 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size 64 \ - --bf16 \ - --tokenizer-type TikTokenizer \ - --tiktoken-pattern v2 \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --distributed-timeout-minutes 2400 \ - --use-flash-attn \ - --inference-rng-tracker \ - \ - --inference-dynamic-batching \ - --inference-dynamic-batching-buffer-size-gb ${BUFFER_SIZE_GB} \ - \ - ${EXTRA_ARGS} \ -" - -# Cuda graphs. -if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then - ARGS+=" \ - --cuda-graph-impl local \ - --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \ - " -else - ARGS+=" \ - --cuda-graph-impl none \ - " -fi - -# Prompts. -if [[ -v PROMPTS ]]; then - ARGS+=" \ - --prompts ${PROMPTS} \ - --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ - " -elif [[ -v PROMPT_FILE ]]; then - ARGS+=" \ - --prompt-file ${PROMPT_FILE} \ - --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ - " -else - ARGS+=" \ - --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \ - --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ - --incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \ - --incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \ - " -fi - -# Command. 
-if [[ "${USE_COORDINATOR}" == "0" ]]; then - CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}" -else - CMD="python -um examples.inference.gpt.gpt_${ENGINE}_inference_with_coordinator ${ARGS}" -fi - -if [[ -v NSIGHT_PREFIX ]]; then - CMD="nsys profile -s none -t nvtx,cuda --cudabacktrace=all --cuda-graph-trace=node --python-backtrace=cuda --wait all -o ${NSIGHT_PREFIX} --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop ${CMD}" -fi - -echo "~~~" -echo "CMD ... ${CMD}." -echo "~~~" -eval ${CMD} diff --git a/examples/inference/gpt/gpt_dynamic_inference_357m.sh b/examples/inference/gpt/gpt_dynamic_inference_357m.sh deleted file mode 100644 index cc99bdddec1..00000000000 --- a/examples/inference/gpt/gpt_dynamic_inference_357m.sh +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - -# Run dynamic batching inference on the 357M GPT model. - -set -u - -# Libraries. -pip install simpy -pip install sentencepiece -pip install tiktoken - -# Environment variables. -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -# Checkpoint. -: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"} -: ${VOCAB_FILE:?"VOCAB_FILE is not set"} -: ${MERGE_FILE:?"MERGE_FILE is not set"} - -# Prompts. -: ${NUM_TOKENS_TO_PROMPT="8 32"} -: ${NUM_TOKENS_TO_GENERATE=256} -: ${INCOMING_REQUESTS_DURATION=10.} -: ${INCOMING_REQUESTS_PER_SEC=100.} - -# Dynamic context. -: ${BUFFER_SIZE_GB=50.} - -# Cuda graphs. -: ${NUM_CUDA_GRAPHS=16} - -# Miscellaneous. -: ${USE_COORDINATOR=0} -: ${ENGINE=dynamic} -: ${NPROC_PER_NODE=1} -: ${EXTRA_ARGS=""} -# NSIGHT_PREFIX=/path/to/nsight/profile - -# Arguments. -ARGS=" \ - --exit-on-missing-checkpoint \ - --transformer-impl local \ - --load ${CHECKPOINT_DIR} \ - --tokenizer-type GPT2BPETokenizer \ - --vocab-file ${VOCAB_FILE} \ - --merge-file ${MERGE_FILE} \ - --exit-on-missing-checkpoint \ - --max-position-embeddings 2048 \ - --seq-length 2048 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 24 \ - --num-attention-heads 16 \ - --hidden-size 1024 \ - --bf16 \ - --micro-batch-size 1 \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --seed 42 \ - --use-flash-attn \ - --inference-rng-tracker \ - \ - --inference-dynamic-batching \ - --inference-dynamic-batching-buffer-size-gb ${BUFFER_SIZE_GB} \ - \ - ${EXTRA_ARGS} \ -" - -# Cuda graphs. -if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then - ARGS+=" \ - --cuda-graph-impl local \ - --inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \ - " -else - ARGS+=" \ - --cuda-graph-impl none \ - " -fi - -# Prompts. -if [[ -v PROMPTS ]]; then - ARGS+=" \ - --prompts ${PROMPTS} \ - --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ - " -elif [[ -v PROMPT_FILE ]]; then - ARGS+=" \ - --prompt-file ${PROMPT_FILE} \ - --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ - " -else - ARGS+=" \ - --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \ - --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \ - --incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \ - --incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \ - " -fi - -# Command. 
-if [[ "${USE_COORDINATOR}" == "0" ]]; then - CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}" -else - CMD="python -m torch.distributed.run --nproc-per-node ${NPROC_PER_NODE} -m examples.inference.gpt.gpt_${ENGINE}_inference_with_coordinator ${ARGS}" -fi - -if [[ -v NSIGHT_PREFIX ]]; then - CMD="nsys profile -s none -t nvtx,cuda --cudabacktrace=all --cuda-graph-trace=node --python-backtrace=cuda --wait all -o ${NSIGHT_PREFIX} --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop ${CMD}" -fi - -echo "~~~" -echo "CMD ... ${CMD}." -echo "~~~" -eval ${CMD} diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/legacy/gpt_dynamic_inference.py similarity index 100% rename from examples/inference/gpt/gpt_dynamic_inference.py rename to examples/inference/legacy/gpt_dynamic_inference.py diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py similarity index 100% rename from examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py rename to examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/legacy/gpt_static_inference.py similarity index 100% rename from examples/inference/gpt/gpt_static_inference.py rename to examples/inference/legacy/gpt_static_inference.py diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/legacy/simple_t5_batch_inference.py similarity index 100% rename from examples/inference/t5/simple_t5_batch_inference.py rename to examples/inference/legacy/simple_t5_batch_inference.py diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py deleted file mode 100644 index 9d8f4465f65..00000000000 --- a/examples/inference/llama_mistral/huggingface_reference.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - -# Set up argument parsing -parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.") -parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation") -parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint") - -# Parse command-line arguments -args = parser.parse_args() - -model_path = args.model_path -prompt = args.prompt - -config = AutoConfig.from_pretrained(model_path) -tokenizer = AutoTokenizer.from_pretrained(model_path, config=config) -model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda() - -inputs = tokenizer(prompt, return_tensors="pt") -for key in inputs: - inputs[key] = inputs[key].cuda() -# top_k, top_p and do_sample are set for greedy argmax based sampling - -outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0) -print(tokenizer.decode(outputs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/examples/inference/llama_mistral/run_static_inference_llama4_scout.sh b/examples/inference/llama_mistral/run_static_inference_llama4_scout.sh deleted file mode 100755 index cc8cfac5e69..00000000000 --- a/examples/inference/llama_mistral/run_static_inference_llama4_scout.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NVTE_APPLY_QK_LAYER_SCALING=0 - 
-DISTRIBUTED_ARGS="--nproc_per_node 8 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr 0.0.0.0 \ - --master_port 6000" - -# Fill in checkpoint path to Llama 4 Scout to run -CHECKPOINT= -PROMPTS="What is the capital of France?" -TOKENS_TO_GENERATE=4 -MAX_BATCH_SIZE=2 - -MODEL_ARGS=" \ - --micro-batch-size 1 \ - --bf16 \ - --no-masked-softmax-fusion \ - --disable-bias-linear \ - --untie-embeddings-and-output-weights \ - --position-embedding-type rope \ - --no-rope-fusion \ - --normalization RMSNorm \ - --swiglu \ - --num-layers 48 \ - --hidden-size 5120 \ - --ffn-hidden-size 16384 \ - --num-attention-heads 40 \ - --group-query-attention \ - --num-query-groups 8 \ - --qk-layernorm \ - --num-experts 16 \ - --moe-ffn-hidden-size 8192 \ - --moe-router-score-function sigmoid \ - --moe-router-topk 1 \ - --moe-router-topk-scaling-factor 1.0 \ - --moe-shared-expert-intermediate-size 8192 \ - --moe-aux-loss-coeff 1e-3 \ - --moe-token-dispatcher-type alltoall \ - --moe-token-drop-policy probs \ - --moe-router-load-balancing-type seq_aux_loss \ - --seq-length 4096 \ - --max-position-embeddings 4096 \ - --tokenizer-type HuggingFaceTokenizer \ - --make-vocab-size-divisible-by 128 \ - --use-mcore-models \ - --rotary-interleaved \ - --rotary-percent 1.0 \ - --rotary-base 500000 \ - --rope-scaling-factor 8.0 \ - --use-rope-scaling \ - --no-bias-swiglu-fusion \ - --qk-l2-norm \ - --moe-apply-probs-on-input \ - --moe-router-dtype fp64 \ -" - -torchrun $DISTRIBUTED_ARGS -m examples.inference.gpt.gpt_static_inference \ - --load ${CHECKPOINT} \ - --tokenizer-model unsloth/Llama-4-Scout-17B-16E-Instruct \ - --dist-ckpt-strictness log_unexpected \ - --tensor-model-parallel-size 8 \ - --prompts ${PROMPTS} \ - --num-tokens-to-generate ${TOKENS_TO_GENERATE} \ - --max-batch-size ${MAX_BATCH_SIZE} \ - ${MODEL_ARGS} diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh deleted file mode 100755 index 06584f0917d..00000000000 --- a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -# This example will start serving the Llama3.1-8B model -export NCCL_IB_SL=1 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NVTE_APPLY_QK_LAYER_SCALING=0 - -DISTRIBUTED_ARGS="--nproc_per_node 1 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr 0.0.0.0 \ - --master_port 6000" - -# Ensure CHECKPOINT and TOKENIZER_MODEL are provided -if [ -z "$1" ] || [ -z "$2" ]; then - echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." 
- echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" - exit 1 -fi - -# Assign command-line arguments to variables -CHECKPOINT=$1 -TOKENIZER_MODEL=$2 - -pip install flask-restful - -torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --use-checkpoint-args \ - --disable-bias-linear \ - --tokenizer-type HuggingFaceTokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --transformer-impl transformer_engine \ - --normalization RMSNorm \ - --group-query-attention \ - --num-query-groups 8 \ - --no-masked-softmax-fusion \ - --attention-softmax-in-fp32 \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --untie-embeddings-and-output-weights \ - --position-embedding-type rope \ - --rotary-percent 1.0 \ - --rotary-base 500000 \ - --use-rope-scaling \ - --use-rotary-position-embeddings \ - --swiglu \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 32 \ - --hidden-size 4096 \ - --ffn-hidden-size 14336 \ - --load ${CHECKPOINT} \ - --num-attention-heads 32 \ - --max-position-embeddings 131072 \ - --bf16 \ - --micro-batch-size 1 \ - --seq-length 8192 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.sh b/examples/inference/llama_mistral/run_text_generation_llama3.sh deleted file mode 100755 index c5fc4103ab5..00000000000 --- a/examples/inference/llama_mistral/run_text_generation_llama3.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# This example will start serving the Llama3-8B model -export NCCL_IB_SL=1 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NVTE_APPLY_QK_LAYER_SCALING=0 - -DISTRIBUTED_ARGS="--nproc_per_node 1 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr 0.0.0.0 \ - --master_port 6000" - -# Ensure CHECKPOINT and TOKENIZER_MODEL are provided -if [ -z "$1" ] || [ -z "$2" ]; then - echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." 
- echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" - exit 1 -fi - -# Assign command-line arguments to variables -CHECKPOINT=$1 -TOKENIZER_MODEL=$2 - -pip install flask-restful - -torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --use-checkpoint-args \ - --disable-bias-linear \ - --tokenizer-type HuggingFaceTokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --transformer-impl transformer_engine \ - --normalization RMSNorm \ - --group-query-attention \ - --num-query-groups 8 \ - --no-masked-softmax-fusion \ - --attention-softmax-in-fp32 \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --untie-embeddings-and-output-weights \ - --position-embedding-type rope \ - --rotary-percent 1.0 \ - --rotary-base 500000 \ - --use-rotary-position-embeddings \ - --swiglu \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 32 \ - --hidden-size 4096 \ - --ffn-hidden-size 14336 \ - --load ${CHECKPOINT} \ - --num-attention-heads 32 \ - --max-position-embeddings 8192 \ - --bf16 \ - --micro-batch-size 1 \ - --seq-length 8192 diff --git a/examples/inference/llama_mistral/run_text_generation_mistral.sh b/examples/inference/llama_mistral/run_text_generation_mistral.sh deleted file mode 100755 index 4358fd494c7..00000000000 --- a/examples/inference/llama_mistral/run_text_generation_mistral.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# This example will start serving the Mistral-7B-v0.3 model -export NCCL_IB_SL=1 -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -DISTRIBUTED_ARGS="--nproc_per_node 1 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr 0.0.0.0 \ - --master_port 6000" - -# Ensure CHECKPOINT and TOKENIZER_MODEL are provided -if [ -z "$1" ] || [ -z "$2" ]; then - echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." - echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" - exit 1 -fi - -# Assign command-line arguments to variables -CHECKPOINT=$1 -TOKENIZER_MODEL=$2 - -pip install flask-restful - -torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --tokenizer-type HuggingFaceTokenizer \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --use-checkpoint-args \ - --apply-layernorm-1p \ - --transformer-impl transformer_engine \ - --normalization RMSNorm \ - --group-query-attention \ - --num-query-groups 8 \ - --no-masked-softmax-fusion \ - --use-flash-attn \ - --untie-embeddings-and-output-weights \ - --disable-bias-linear \ - --position-embedding-type rope \ - --rotary-percent 1.0 \ - --rotary-base 1000000 \ - --swiglu \ - --ffn-hidden-size 14336 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 32 \ - --hidden-size 4096 \ - --load ${CHECKPOINT} \ - --num-attention-heads 32 \ - --max-position-embeddings 4096 \ - --bf16 \ - --micro-batch-size 1 \ - --seq-length 4096 \ - --seed 101 diff --git a/examples/inference/offline_inference.py b/examples/inference/offline_inference.py new file mode 100644 index 00000000000..cb5747f8418 --- /dev/null +++ b/examples/inference/offline_inference.py @@ -0,0 +1,278 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +"""Offline inference example using the Megatron high-level API. + +Mirrors examples/inference/legacy/gpt_dynamic_inference.py but drives the +``DynamicInferenceEngine`` through ``MegatronLLM`` (sync) or +``MegatronAsyncLLM`` (async, via ``--async-mode``) instead of the manual +add_request/step_modern loop. 
Output format (setup prefix, unique prompt +blocks, throughput line, optional JSON dump) matches the legacy script. + +Run modes are selected at the CLI: + + # sync, direct (default) + python -m examples.inference.offline_inference --load ... + + # sync, coordinator + python -m examples.inference.offline_inference --load --use-coordinator ... + + # async (with or without --use-coordinator) + python -m examples.inference.offline_inference --load --async-mode ... +""" + +import asyncio +import logging +import os +import sys +from argparse import ArgumentParser + +import torch +import torch.distributed as dist + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +from examples.inference.utils import ( + build_dynamic_engine_setup_prefix, + build_requests, + dump_inference_results_to_json, + get_curr_time, + get_global_peak_memory_stats_bytes, + print_unique_prompts_and_outputs, +) +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer +from megatron.core.utils import configure_nvtx_profiling +from megatron.inference import MegatronAsyncLLM, MegatronLLM +from megatron.inference.utils import ( + add_inference_args, + get_inference_config_from_model_and_args, + get_model_for_inference, +) +from megatron.training import initialize_megatron +from megatron.training.arguments import parse_and_validate_args + + +def add_offline_inference_args(parser: ArgumentParser) -> ArgumentParser: + parser = add_inference_args(parser) + group = parser.add_argument_group(title='Offline inference (high-level API)') + group.add_argument("--use-coordinator", action="store_true", default=False) + group.add_argument("--coordinator-host", type=str, default=None) + group.add_argument("--coordinator-port", type=int, default=None) + group.add_argument( + "--async-mode", + action="store_true", + default=False, + help="Drive MegatronAsyncLLM via asyncio.run instead of MegatronLLM.", + ) + return parser + + +def _validate_high_level_api_args(args): + # engine.reset() between trials races the runtime engine loop in + # coordinator mode (engine_loop_task runs on the runtime thread). + if args.use_coordinator and args.inference_repeat_n > 1: + raise ValueError( + "--use-coordinator with --inference-repeat-n > 1 is not supported: " + "engine.reset() races the runtime engine loop in coordinator mode." + ) + # The high-level API takes one sampling_params per generate() call. + if args.prompt_file and getattr(args, "num_tokens_from_file", False): + raise ValueError( + "--prompt-file with --num-tokens-from-file produces per-request " + "num_tokens_to_generate, but the high-level API takes one " + "sampling_params per generate() call. Use a uniform " + "--num-tokens-to-generate instead." + ) + + +def _validate_prompt_lengths(args, llm, requests): + # Validate prompt lengths against the resolved max_tokens (default + # is filled in by DynamicInferenceContext during construction). 
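+    # Illustrative numbers: with context.max_tokens resolved to 2048, a
+    # 4096-token prompt fails this check unless --enable-chunked-prefill is
+    # set, in which case prefill is split across engine steps and the
+    # per-step token cap no longer bounds prompt length.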
+    if args.enable_chunked_prefill:
+        return
+    invalid = {
+        idx: len(r.prompt_tokens)
+        for idx, r in enumerate(requests)
+        if len(r.prompt_tokens) > llm.context.max_tokens
+    }
+    assert not invalid, (
+        "request idxs with prompts longer than context.max_tokens: "
+        + ", ".join(f"{k}({v})" for k, v in invalid.items())
+    )
+
+
+def _capture_engine_stats(llm) -> dict:
+    return {
+        "step_count": llm.engine.context.step_count,
+        "lifetime_prefill_token_count": llm.engine.context.lifetime_prefill_token_count,
+        "capture_stats": llm.engine.capture_stats,
+    }
+
+
+def _print_setup_prefix(setup_prefix: str) -> None:
+    if dist.get_rank() == 0:
+        print("~~~")
+        print(setup_prefix)
+        print("~~~")
+
+
+def _report_results(
+    args, setup_prefix, results, throughputs, total_time, peak_mem_stats, captured
+):
+    if dist.get_rank() != 0:
+        return
+
+    print_unique_prompts_and_outputs(results)
+    dump_inference_results_to_json(
+        args,
+        results,
+        throughputs,
+        peak_mem_stats,
+        captured["step_count"],
+        captured["lifetime_prefill_token_count"],
+    )
+
+    stats = torch.cuda.memory_stats()
+    peak_alloc_gb = stats["allocated_bytes.all.peak"] / 1024**3
+    peak_resvd_gb = stats["reserved_bytes.all.peak"] / 1024**3
+    throughput = throughputs[-1] if throughputs else 0.0
+    capture_str = (
+        f"{captured['capture_stats']['time']:.2f} sec"
+        if captured["capture_stats"]
+        else "--"
+    )
+    print("~~~")
+    print(
+        f"{setup_prefix} … "
+        f"throughput: {throughput:.3f} tok/s … "
+        f"total time: {total_time:.3f}s … "
+        f"mem {peak_alloc_gb:.1f}/{peak_resvd_gb:.1f} GB … "
+        f"steps: {captured['step_count']:d} … "
+        f"capture {capture_str}"
+    )
+    print("~~~")
+
+
+def _run_sync(args, model, tokenizer, inference_config, requests, prompts_list, sampling_params):
+    results = []
+    throughputs = []
+    total_time = 0.0
+    captured = {"step_count": 0, "lifetime_prefill_token_count": 0, "capture_stats": None}
+    setup_prefix = ""
+
+    with MegatronLLM(
+        model=model,
+        tokenizer=tokenizer,
+        inference_config=inference_config,
+        use_coordinator=args.use_coordinator,
+        coordinator_host=args.coordinator_host,
+        coordinator_port=args.coordinator_port,
+    ) as llm:
+        setup_prefix = build_dynamic_engine_setup_prefix(args, model, llm.context, requests)
+        _validate_prompt_lengths(args, llm, requests)
+
+        # Coordinator mode: only the primary rank submits work; worker ranks
+        # fall through and block in __exit__ until shutdown propagates STOP.
+        if llm.is_primary_rank:
+            _print_setup_prefix(setup_prefix)
+            for trial_idx in range(args.inference_repeat_n):
+                # Skip first-trial reset; the engine is fresh post-construction.
+                if trial_idx > 0:
+                    llm.engine.reset()
+                torch.cuda.reset_peak_memory_stats()
+
+                t = get_curr_time()
+                results = llm.generate(prompts_list, sampling_params)
+                torch.cuda.synchronize()
+                total_time = get_curr_time() - t
+
+                total_output_tokens = sum(len(r.generated_tokens) for r in results)
+                throughputs.append(total_output_tokens / total_time)
+                captured = _capture_engine_stats(llm)
+
+    # Engine is shut down on all ranks; safe to all-reduce peak-memory now.
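+    # (get_global_peak_memory_stats_bytes all-reduces the per-rank peak with
+    # MAX, so every rank, including coordinator-mode workers, must reach it.)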
+ peak_mem_stats = get_global_peak_memory_stats_bytes() + _report_results(args, setup_prefix, results, throughputs, total_time, peak_mem_stats, captured) + + +async def _run_async( + args, model, tokenizer, inference_config, requests, prompts_list, sampling_params +): + results = [] + throughputs = [] + total_time = 0.0 + captured = {"step_count": 0, "lifetime_prefill_token_count": 0, "capture_stats": None} + setup_prefix = "" + + async with MegatronAsyncLLM( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, + use_coordinator=args.use_coordinator, + coordinator_host=args.coordinator_host, + coordinator_port=args.coordinator_port, + ) as llm: + setup_prefix = build_dynamic_engine_setup_prefix(args, model, llm.context, requests) + _validate_prompt_lengths(args, llm, requests) + + if llm.is_primary_rank: + _print_setup_prefix(setup_prefix) + for trial_idx in range(args.inference_repeat_n): + if trial_idx > 0: + llm.engine.reset() + torch.cuda.reset_peak_memory_stats() + + t = get_curr_time() + results = await llm.generate(prompts_list, sampling_params) + torch.cuda.synchronize() + total_time = get_curr_time() - t + + total_output_tokens = sum(len(r.generated_tokens) for r in results) + throughputs.append(total_output_tokens / total_time) + captured = _capture_engine_stats(llm) + + peak_mem_stats = get_global_peak_memory_stats_bytes() + _report_results(args, setup_prefix, results, throughputs, total_time, peak_mem_stats, captured) + + +def main(): + args = parse_and_validate_args( + extra_args_provider=add_offline_inference_args, + args_defaults={'no_load_rng': True, 'no_load_optim': True}, + ) + initialize_megatron() + _validate_high_level_api_args(args) + + if os.environ.get("NSIGHT_PREFIX"): + torch.cuda.cudart().cudaProfilerStart() + + level = getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), logging.INFO) + logging.basicConfig(level=level, force=True) + configure_nvtx_profiling(True) + + tokenizer = build_tokenizer(args) + torch.cuda.reset_peak_memory_stats() + + model = get_model_for_inference() + inference_config = get_inference_config_from_model_and_args(model, args) + requests = build_requests(args, tokenizer, sampling_params=None) + sampling_params = requests[0].sampling_params + + max_gen_length = sampling_params.num_tokens_to_generate + max_context_length = max(len(r.prompt_tokens) for r in requests) + inference_config.max_sequence_length = max_context_length + max_gen_length + + prompts_list = [r.prompt_text for r in requests] + + runner_args = (args, model, tokenizer, inference_config, requests, prompts_list, sampling_params) + if args.async_mode: + asyncio.run(_run_async(*runner_args)) + else: + _run_sync(*runner_args) + + if os.environ.get("NSIGHT_PREFIX"): + torch.cuda.cudart().cudaProfilerStop() + + +if __name__ == "__main__": + main() diff --git a/examples/inference/run_offline_inference.sh b/examples/inference/run_offline_inference.sh new file mode 100644 index 00000000000..b34c832ded2 --- /dev/null +++ b/examples/inference/run_offline_inference.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Offline inference launcher for the Megatron high-level API examples. +# +# Requires `simpy` (used by examples/inference/utils.py for synthetic request +# arrival simulation). If it is not already installed: +# pip install simpy +# +# Required CLI args: +# --hf-token Hugging Face token for tokenizer downloads. +# --checkpoint Path to the Megatron checkpoint passed as --load. 
+# +# Optional env vars (mode switching): +# MODE=sync|async (default: sync) +# USE_COORDINATOR=0|1 (default: 0) +# NPROC= (default: 1) +# +# Invocations: +# sync + direct (default): +# bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt +# sync + coordinator: +# USE_COORDINATOR=1 bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt +# async + direct: +# MODE=async bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt +# async + coordinator: +# MODE=async USE_COORDINATOR=1 bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt + +HF_TOKEN="" +CHECKPOINT="" +while [[ $# -gt 0 ]]; do + case "$1" in + --hf-token) + HF_TOKEN="$2" + shift 2 + ;; + --checkpoint) + CHECKPOINT="$2" + shift 2 + ;; + -h|--help) + sed -n '2,20p' "$0" + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + echo "Run with -h for usage." >&2 + exit 1 + ;; + esac +done + +if [[ -z "$HF_TOKEN" ]]; then + echo "Error: --hf-token is required" >&2 + exit 1 +fi +if [[ -z "$CHECKPOINT" ]]; then + echo "Error: --checkpoint is required" >&2 + exit 1 +fi +export HF_TOKEN + +MODE="${MODE:-sync}" +USE_COORDINATOR="${USE_COORDINATOR:-0}" +NPROC="${NPROC:-1}" + +if [[ "$MODE" != "sync" && "$MODE" != "async" ]]; then + echo "Invalid MODE='$MODE'; expected 'sync' or 'async'." >&2 + exit 1 +fi + +ENTRY="examples.inference.offline_inference" + +EXTRA_ARGS="" +if [[ "$USE_COORDINATOR" == "1" ]]; then + EXTRA_ARGS="$EXTRA_ARGS --use-coordinator" +fi +if [[ "$MODE" == "async" ]]; then + EXTRA_ARGS="$EXTRA_ARGS --async-mode" +fi + +torchrun --nproc-per-node "$NPROC" \ + -m "$ENTRY" $EXTRA_ARGS \ + --load "$CHECKPOINT" \ + --bf16 \ + --tensor-model-parallel-size 1 \ + --micro-batch-size 64 \ + --dist-ckpt-strictness log_unexpected \ + --inference-rng-tracker \ + --cuda-graph-impl local \ + --decode-only-cuda-graphs \ + --tokenizer-type HuggingFaceTokenizer \ + --tokenizer-model Qwen/Qwen2.5-1.5B \ + --no-use-tokenizer-model-from-checkpoint-args \ + --num-layers 28 \ + --hidden-size 1536 \ + --num-attention-heads 12 \ + --max-position-embeddings 32768 \ + --num-query-groups 2 \ + --group-query-attention \ + --swiglu \ + --normalization RMSNorm \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --seq-length 32768 \ + --ffn-hidden-size 8960 diff --git a/examples/inference/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh deleted file mode 100755 index e8e61adb163..00000000000 --- a/examples/inference/run_text_generation_server_345M.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# This example will start serving the 345M model. 
-DISTRIBUTED_ARGS="--nproc_per_node 1 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -CHECKPOINT= -VOCAB_FILE= -MERGE_FILE= - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -pip install flask-restful - -torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --load ${CHECKPOINT} \ - --num-attention-heads 16 \ - --max-position-embeddings 1024 \ - --tokenizer-type GPT2BPETokenizer \ - --fp16 \ - --micro-batch-size 1 \ - --seq-length 1024 \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --seed 42 diff --git a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh deleted file mode 100755 index 368cec3b312..00000000000 --- a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# This example will start serving the 345M model that is partitioned 8 way tensor parallel -DISTRIBUTED_ARGS="--nproc_per_node 8 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000" - -CHECKPOINT= -VOCAB_FILE= -MERGE_FILE= - -pip install flask-restful - -python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --tensor-model-parallel-size 8 \ - --pipeline-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --load ${CHECKPOINT} \ - --num-attention-heads 16 \ - --max-position-embeddings 1024 \ - --tokenizer-type GPT2BPETokenizer \ - --fp16 \ - --micro-batch-size 1 \ - --seq-length 1024 \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --seed 42 diff --git a/examples/inference/gpt/utils.py b/examples/inference/utils.py similarity index 70% rename from examples/inference/gpt/utils.py rename to examples/inference/utils.py index c9b1c05c544..d667f5a0f72 100644 --- a/examples/inference/gpt/utils.py +++ b/examples/inference/utils.py @@ -1,11 +1,13 @@ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import copy +import hashlib import itertools import json import random import time from argparse import ArgumentParser, Namespace +from collections import defaultdict from functools import partial from typing import Any, List, Optional @@ -324,3 +326,109 @@ def get_global_peak_memory_stats_bytes() -> dict: torch.distributed.all_reduce(t, op=torch.distributed.ReduceOp.MAX) peak_alloc = int(t[0].item()) return {"mem-max-allocated-bytes": peak_alloc} + + +def escape_str(s: str) -> str: + return s.replace("\n", "\\n") + + +def print_unique_prompts_and_outputs(results: List["DynamicInferenceRequest"]) -> None: + """Print unique prompts and their outputs in legacy gpt_dynamic_inference.py format. + + Reads from the high-level API's ``DynamicInferenceRequest`` records returned + by ``MegatronLLM.generate`` / ``MegatronAsyncLLM.generate``. + """ + print("~~~~ Unique prompts + outputs. 
~~~~")

+    unique_prompt_map = defaultdict(list)
+    for idx, req in enumerate(results):
+        unique_prompt_map[req.prompt].append(idx)
+
+    for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()):
+        prompt_len = len(results[request_idxs[0]].prompt_tokens)
+        print(
+            f"\n{unique_idx+1}/{len(unique_prompt_map)} "
+            f"[n {len(request_idxs)}, l {prompt_len}] {escape_str(prompt_text)}"
+        )
+
+        output_map = defaultdict(list)
+        for idx in request_idxs:
+            output_map[results[idx].generated_text].append(idx)
+
+        for output_text, output_request_idxs in output_map.items():
+            evicted = any(
+                event.type.name == "EVICT"
+                for idx in output_request_idxs
+                for event in results[idx].events
+            )
+            if output_text is not None:
+                o_hash = hashlib.sha256((prompt_text + output_text).encode()).hexdigest()[:6]
+                o_len = len(results[output_request_idxs[0]].generated_tokens)
+                escaped_output_text = escape_str(output_text)
+            else:
+                o_hash = "--"
+                o_len = 0
+                escaped_output_text = "--"
+            print(
+                f"  >>>> [n {len(output_request_idxs)}, {o_len} tokens, hash {o_hash}"
+                f"{', evicted' if evicted else ''}] {escaped_output_text}"
+            )
+
+
+def dump_inference_results_to_json(
+    args: Namespace,
+    results: List["DynamicInferenceRequest"],
+    throughputs: List[float],
+    peak_mem_stats: dict,
+    step_count: int,
+    lifetime_prefill_token_count: int,
+) -> None:
+    """JSON dump of per-request results matching legacy gpt_dynamic_inference.py shape.
+
+    Reads from the high-level API's ``DynamicInferenceRequest`` records.
+    Note: ``latency`` is currently always ``None`` in direct mode because the
+    low-level engine doesn't populate it on ``DynamicInferenceRequest.merge()``;
+    it will be populated once that field is wired up upstream.
+    """
+    if not args.output_path:
+        return
+
+    json_results = {}
+    for i, req in enumerate(results):
+        if i % args.output_every_n_results == 0 or i == len(results) - 1:
+            # cuda_graph_request_count_map is only populated by the legacy
+            # add_request/step_modern loop and is not surfaced through the
+            # high-level API; omitting it here.
+            result_dict = {
+                "input_prompt": req.prompt,
+                "generated_text": req.generated_text,
+                "generated_tokens": req.generated_tokens,
+                "latency": req.latency,
+                "ttft": req.ttft,
+                "step_count": step_count,
+                "top_n_logprobs": getattr(req, 'generated_top_n_logprobs', None),
+                "prompt_top_n_logprobs": getattr(req, 'prompt_top_n_logprobs', None),
+            }
+            if req.sampling_params.return_log_probs:
+                prompt_lp = getattr(req, 'prompt_log_probs', None)
+                generated_lp = getattr(req, 'generated_log_probs', None)
+                result_dict["prompt_logprobs"] = prompt_lp
+                result_dict["generated_logprobs"] = generated_lp
+                # Synthesize the legacy "logprobs" field as the concatenation,
+                # since DynamicInferenceRequest doesn't carry a single combined list.
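+                # Illustrative values: prompt_lp=[-0.11, -0.42] and
+                # generated_lp=[-0.07] yield logprobs=[-0.11, -0.42, -0.07],
+                # matching the legacy combined list.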
+ if prompt_lp is not None or generated_lp is not None: + result_dict["logprobs"] = (prompt_lp or []) + (generated_lp or []) + else: + result_dict["logprobs"] = None + if args.output_request_events: + result_dict["events"] = [e.serialize() for e in req.events] + json_results[req.request_id] = result_dict + + if args.record_throughput: + json_results["throughput"] = throughputs + json_results.update(peak_mem_stats) + json_results["lifetime_prefill_token_count"] = lifetime_prefill_token_count + + print(f' Saving results to {args.output_path}') + with open(args.output_path, "w") as fp: + json.dump(json_results, fp, indent=1) diff --git a/megatron/inference/_llm_base.py b/megatron/inference/_llm_base.py index ecc56cb4329..939ad9d591f 100644 --- a/megatron/inference/_llm_base.py +++ b/megatron/inference/_llm_base.py @@ -146,7 +146,9 @@ async def setup(self, *, loop: asyncio.AbstractEventLoop) -> None: # installed when the user only needs direct mode. from megatron.core.inference.inference_client import InferenceClient - client = InferenceClient(coord_addr) + # deserialize=True returns DynamicInferenceRequest objects from + # add_request futures, matching the high-level API contract. + client = InferenceClient(coord_addr, deserialize=True) client.start(loop=loop) self._client = client @@ -354,11 +356,9 @@ async def _generate_impl( assert self._coord_runtime is not None and self._coord_runtime.client is not None futures = [self._coord_runtime.client.add_request(p, sp) for p in prompts] return list(await asyncio.gather(*futures)) - # Direct mode: ``engine.generate`` accepts ``list[str]`` or - # ``list[list[int]]``; both flow through ``engine.add_request`` which - # accepts ``Union[str, List[int], Tensor]`` despite the narrower declared - # type on ``engine.generate`` itself. TODO: widen that signature upstream. - records = await asyncio.to_thread(self._engine.generate, prompts, sp) + # TODO: replace with an upstream ``engine.async_generate`` so direct-mode + # async generate doesn't block the caller's event loop. + records = self._engine.generate(prompts, sp) return [r.merge() for r in records] async def _pause_impl(self) -> None: @@ -388,6 +388,12 @@ async def _resume_impl(self) -> None: async def _shutdown_impl(self) -> None: if self._is_primary_rank: assert self._coord_runtime is not None and self._coord_runtime.client is not None + # The coordinator only honors STOP from PAUSED or SUSPENDED. If + # the engine is RUNNING (the typical state at shutdown), pause + # first so the STOP isn't ignored. 
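+            # (State walk under this contract: RUNNING -> pause_engines() ->
+            # PAUSED -> stop_engines() -> STOPPED; a STOP sent while RUNNING
+            # would be dropped and the wait below would never return.)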
+ if self._engine.state == EngineState.RUNNING: + self._coord_runtime.client.pause_engines() + await self._engine.wait_until(EngineState.PAUSED) self._coord_runtime.client.stop_engines() await self._engine.wait_until(EngineState.STOPPED) await self._coord_runtime.teardown() From 44ec5a9f4e1bae120546d5c4c4a0970d7da992a8 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 17:46:40 -0700 Subject: [PATCH 08/15] feat(inference): add launch_inference_server example, fix daemon-thread CUDA device Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/inference/launch_inference_server.py | 106 ++++++++++++++++++ examples/inference/run_inference_server.sh | 98 ++++++++++++++++ examples/inference/run_offline_inference.sh | 52 +++++---- megatron/inference/_llm_base.py | 14 +++ 4 files changed, 249 insertions(+), 21 deletions(-) create mode 100644 examples/inference/launch_inference_server.py create mode 100644 examples/inference/run_inference_server.sh diff --git a/examples/inference/launch_inference_server.py b/examples/inference/launch_inference_server.py new file mode 100644 index 00000000000..323d1563121 --- /dev/null +++ b/examples/inference/launch_inference_server.py @@ -0,0 +1,106 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +"""OpenAI-compatible inference server using the Megatron high-level API. + +Mirrors tools/run_dynamic_text_generation_server.py but drives the +``DynamicInferenceEngine`` through ``MegatronAsyncLLM.serve(...)`` instead +of building the coordinator/engine pipeline manually. Coordinator mode is +required (HTTP serving uses the coordinator path); ``use_coordinator=True`` +is hardcoded in the script. +""" + +import asyncio +import os +import sys +from argparse import ArgumentParser + +import torch + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) + +from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer +from megatron.core.utils import configure_nvtx_profiling +from megatron.inference import MegatronAsyncLLM, ServeConfig +from megatron.inference.utils import ( + add_inference_args, + get_inference_config_from_model_and_args, + get_model_for_inference, +) +from megatron.training import get_args, initialize_megatron +from megatron.training.arguments import parse_and_validate_args + + +def add_serve_args(parser: ArgumentParser) -> ArgumentParser: + parser = add_inference_args(parser) + group = parser.add_argument_group(title='High-level inference server') + group.add_argument("--coordinator-host", type=str, default=None) + group.add_argument("--coordinator-port", type=int, default=None) + group.add_argument("--host", type=str, default="0.0.0.0", help="HTTP bind host") + group.add_argument("--port", type=int, default=5000, help="HTTP bind port") + group.add_argument( + "--parsers", type=str, nargs="+", default=[], help="Response parser names" + ) + group.add_argument( + "--verbose", action="store_true", default=False, help="Per-request HTTP logging" + ) + group.add_argument( + "--frontend-replicas", type=int, default=4, + help="Number of HTTP frontend processes spawned on the primary rank.", + ) + return parser + + +async def _serve(args, model, tokenizer, inference_config): + async with MegatronAsyncLLM( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, + use_coordinator=True, + coordinator_host=args.coordinator_host, + coordinator_port=args.coordinator_port, + ) as llm: + serve_config = ServeConfig( + host=args.host, + 
port=args.port, + parsers=args.parsers, + verbose=args.verbose, + frontend_replicas=args.frontend_replicas, + ) + await llm.serve(serve_config, blocking=True) + + +def main(): + parse_and_validate_args( + extra_args_provider=add_serve_args, + args_defaults={'no_load_rng': True, 'no_load_optim': True}, + ) + initialize_megatron() + + args = get_args() + + # Match the legacy tool's NVTX gating. + if args.profile and args.nvtx_ranges: + configure_nvtx_profiling(True) + + # Required for lm-eval loglikelihood compatibility: keeps prompt logits + # materialized so echo=True / logprob requests work end-to-end. Matches + # tools/run_dynamic_text_generation_server.py. + args.return_log_probs = True + + tokenizer = build_tokenizer(args) + model = get_model_for_inference() + inference_config = get_inference_config_from_model_and_args(model, args) + + try: + asyncio.run(_serve(args, model, tokenizer, inference_config)) + except KeyboardInterrupt: + print("Server process interrupted by user.") + finally: + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/examples/inference/run_inference_server.sh b/examples/inference/run_inference_server.sh new file mode 100644 index 00000000000..de50816f97d --- /dev/null +++ b/examples/inference/run_inference_server.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# OpenAI-compatible inference server launcher for the Megatron high-level API. +# +# Required CLI args: +# --hf-token Hugging Face token for tokenizer downloads. +# --hf-home Hugging Face cache directory. +# --checkpoint Path to the Megatron checkpoint passed as --load. +# +# Optional CLI args: +# --nproc Number of processes (default: 8). +# +# Example: +# bash run_inference_server.sh \ +# --hf-token hf_xxx \ +# --hf-home /path/to/hf_home \ +# --checkpoint /path/to/ckpt + +HF_TOKEN="" +HF_HOME="" +CHECKPOINT="" +NPROC=8 + +while [[ $# -gt 0 ]]; do + case "$1" in + --hf-token) + HF_TOKEN="$2" + shift 2 + ;; + --hf-home) + HF_HOME="$2" + shift 2 + ;; + --checkpoint) + CHECKPOINT="$2" + shift 2 + ;; + --nproc) + NPROC="$2" + shift 2 + ;; + -h|--help) + sed -n '2,16p' "$0" + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + echo "Run with -h for usage." >&2 + exit 1 + ;; + esac +done + +if [[ -z "$HF_TOKEN" ]]; then + echo "Error: --hf-token is required" >&2 + exit 1 +fi +if [[ -z "$HF_HOME" ]]; then + echo "Error: --hf-home is required" >&2 + exit 1 +fi +if [[ -z "$CHECKPOINT" ]]; then + echo "Error: --checkpoint is required" >&2 + exit 1 +fi + +export HF_TOKEN +export HF_HOME +# Required by Megatron when using tensor or context parallelism. 
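+# (Limiting the device to one hardware work queue keeps kernel launch order
+# deterministic, which tensor/sequence-parallel communication overlap relies on.)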
+export CUDA_DEVICE_MAX_CONNECTIONS=1 + +torchrun --nproc-per-node "$NPROC" \ + -m examples.inference.launch_inference_server \ + --tensor-model-parallel-size 2 \ + --expert-tensor-parallel-size 1 \ + --expert-model-parallel-size 8 \ + --sequence-parallel \ + --pipeline-model-parallel-size 1 \ + --inference-max-seq-length 4096 \ + --load "$CHECKPOINT" \ + --micro-batch-size 1 \ + --moe-router-dtype fp32 \ + --moe-token-dispatcher-type alltoall \ + --use-checkpoint-args \ + --bf16 \ + --attention-backend flash \ + --transformer-impl inference_optimized \ + --te-rng-tracker \ + --inference-rng-tracker \ + --cuda-graph-impl "local" \ + --dist-ckpt-strictness log_unexpected \ + --inference-dynamic-batching-buffer-size-gb 20 \ + --model-provider hybrid \ + --inference-dynamic-batching-max-tokens 2048 \ + --enable-chunked-prefill \ + --inference-logging-step-interval 50 \ + --inference-dynamic-batching-num-cuda-graphs -1 \ + --cuda-graph-scope full_iteration_inference \ + --inference-dynamic-batching-max-requests 256 diff --git a/examples/inference/run_offline_inference.sh b/examples/inference/run_offline_inference.sh index b34c832ded2..bb3d0b33266 100644 --- a/examples/inference/run_offline_inference.sh +++ b/examples/inference/run_offline_inference.sh @@ -6,26 +6,30 @@ # pip install simpy # # Required CLI args: -# --hf-token Hugging Face token for tokenizer downloads. -# --checkpoint Path to the Megatron checkpoint passed as --load. +# --hf-token Hugging Face token for tokenizer downloads. +# --checkpoint Path to the Megatron checkpoint passed as --load. # -# Optional env vars (mode switching): -# MODE=sync|async (default: sync) -# USE_COORDINATOR=0|1 (default: 0) -# NPROC= (default: 1) +# Optional CLI args: +# --mode sync|async Selects MegatronLLM vs MegatronAsyncLLM (default: sync). +# --use-coordinator Run in coordinator mode (default: direct). +# --nproc Number of processes (default: 8). # -# Invocations: -# sync + direct (default): +# Examples: +# sync + direct (defaults): # bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt # sync + coordinator: -# USE_COORDINATOR=1 bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt +# bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt --use-coordinator # async + direct: -# MODE=async bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt +# bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt --mode async # async + coordinator: -# MODE=async USE_COORDINATOR=1 bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt +# bash run_offline_inference.sh --hf-token hf_xxx --checkpoint /path/to/ckpt --mode async --use-coordinator HF_TOKEN="" CHECKPOINT="" +MODE="sync" +USE_COORDINATOR=0 +NPROC=8 + while [[ $# -gt 0 ]]; do case "$1" in --hf-token) @@ -36,8 +40,20 @@ while [[ $# -gt 0 ]]; do CHECKPOINT="$2" shift 2 ;; + --mode) + MODE="$2" + shift 2 + ;; + --use-coordinator) + USE_COORDINATOR=1 + shift + ;; + --nproc) + NPROC="$2" + shift 2 + ;; -h|--help) - sed -n '2,20p' "$0" + sed -n '2,26p' "$0" exit 0 ;; *) @@ -56,18 +72,12 @@ if [[ -z "$CHECKPOINT" ]]; then echo "Error: --checkpoint is required" >&2 exit 1 fi -export HF_TOKEN - -MODE="${MODE:-sync}" -USE_COORDINATOR="${USE_COORDINATOR:-0}" -NPROC="${NPROC:-1}" - if [[ "$MODE" != "sync" && "$MODE" != "async" ]]; then - echo "Invalid MODE='$MODE'; expected 'sync' or 'async'." >&2 + echo "Invalid --mode='$MODE'; expected 'sync' or 'async'." 
>&2 exit 1 fi -ENTRY="examples.inference.offline_inference" +export HF_TOKEN EXTRA_ARGS="" if [[ "$USE_COORDINATOR" == "1" ]]; then @@ -78,7 +88,7 @@ if [[ "$MODE" == "async" ]]; then fi torchrun --nproc-per-node "$NPROC" \ - -m "$ENTRY" $EXTRA_ARGS \ + -m examples.inference.offline_inference $EXTRA_ARGS \ --load "$CHECKPOINT" \ --bf16 \ --tensor-model-parallel-size 1 \ diff --git a/megatron/inference/_llm_base.py b/megatron/inference/_llm_base.py index 939ad9d591f..c9489f260d2 100644 --- a/megatron/inference/_llm_base.py +++ b/megatron/inference/_llm_base.py @@ -46,7 +46,21 @@ def start(self) -> None: if self._started: return + # PyTorch's CUDA current-device is thread-local and defaults to 0 on + # new threads. Capture the spawning thread's device so NCCL ops + # scheduled on the runtime loop (e.g. inside + # ``start_listening_to_data_parallel_coordinator``) hit the right GPU + # under torchrun, where every process sees all GPUs and rank-to-device + # mapping is set on the main thread only. + import torch + + parent_device = ( + torch.cuda.current_device() if torch.cuda.is_available() else None + ) + def _run_loop() -> None: + if parent_device is not None: + torch.cuda.set_device(parent_device) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) self._loop = loop From 0d8ae8b7784f662b4c65f153efe59f16c296d53c Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 18:33:14 -0700 Subject: [PATCH 09/15] fix(tests): repoint inference recipes and cuda_graphs.sh to examples/inference/legacy/ Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cuda_graphs.sh | 2 +- .../recipes/h100/gpt-dynamic-inference-with-coordinator.yaml | 2 +- tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml | 2 +- tests/test_utils/recipes/h100/gpt-static-inference.yaml | 2 +- .../recipes/h100/mamba-dynamic-inference-with-coordinator.yaml | 2 +- tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml | 2 +- tests/test_utils/recipes/h100/mamba-static-inference.yaml | 2 +- .../recipes/h100/moe-dynamic-inference-with-coordinator.yaml | 2 +- tests/test_utils/recipes/h100/moe-dynamic-inference.yaml | 2 +- tests/test_utils/recipes/h100/moe-static-inference.yaml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh index 641019c9750..d7b3588e6ee 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh +++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_validation/cuda_graphs.sh @@ -97,7 +97,7 @@ if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then fi # Command. -CMD="python -m examples.inference.gpt.gpt_dynamic_inference ${ARGS}" +CMD="python -m examples.inference.legacy.gpt_dynamic_inference ${ARGS}" echo "~~~" echo "CMD ...${CMD}." 
echo "~~~" diff --git a/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml index 0349896345b..c889a3ca9f9 100644 --- a/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/h100/gpt-dynamic-inference-with-coordinator.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=/mnt/artifacts/" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git a/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml index 2915263c0e7..02199b5b223 100644 --- a/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml +++ b/tests/test_utils/recipes/h100/gpt-dynamic-inference.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=/mnt/artifacts/" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git a/tests/test_utils/recipes/h100/gpt-static-inference.yaml b/tests/test_utils/recipes/h100/gpt-static-inference.yaml index 87046588b2b..e337360dc77 100644 --- a/tests/test_utils/recipes/h100/gpt-static-inference.yaml +++ b/tests/test_utils/recipes/h100/gpt-static-inference.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "TENSORBOARD_PATH={assets_dir}/tensorboard" diff --git a/tests/test_utils/recipes/h100/mamba-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/mamba-dynamic-inference-with-coordinator.yaml index 77951e97d66..ed55f10f633 100644 --- a/tests/test_utils/recipes/h100/mamba-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/h100/mamba-dynamic-inference-with-coordinator.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git 
a/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml b/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml index aa78cdb8316..debd8c5369d 100644 --- a/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml +++ b/tests/test_utils/recipes/h100/mamba-dynamic-inference.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git a/tests/test_utils/recipes/h100/mamba-static-inference.yaml b/tests/test_utils/recipes/h100/mamba-static-inference.yaml index b36c4a8f765..9258d6e75cf 100644 --- a/tests/test_utils/recipes/h100/mamba-static-inference.yaml +++ b/tests/test_utils/recipes/h100/mamba-static-inference.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git a/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml index bc75ed1f9a6..2c560d06a5c 100644 --- a/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml +++ b/tests/test_utils/recipes/h100/moe-dynamic-inference-with-coordinator.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=/mnt/artifacts/" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git a/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml b/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml index fc1c07231c3..9f64afd0451 100644 --- a/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml +++ b/tests/test_utils/recipes/h100/moe-dynamic-inference.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_dynamic_inference.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_dynamic_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" diff --git a/tests/test_utils/recipes/h100/moe-static-inference.yaml b/tests/test_utils/recipes/h100/moe-static-inference.yaml index fdab3ff430b..383b98a6650 100644 --- 
a/tests/test_utils/recipes/h100/moe-static-inference.yaml +++ b/tests/test_utils/recipes/h100/moe-static-inference.yaml @@ -41,7 +41,7 @@ spec: "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" "DATA_PATH=null" "DATA_CACHE_PATH=/workspace/data/cache" - "TRAINING_SCRIPT_PATH=examples/inference/gpt/gpt_static_inference.py" + "TRAINING_SCRIPT_PATH=examples/inference/legacy/gpt_static_inference.py" "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" "OUTPUT_PATH={assets_dir}" From 589626d673107d51942116a085c33e72ed21f22f Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 18:53:03 -0700 Subject: [PATCH 10/15] test(inference): add unit tests for the high-level inference API Co-Authored-By: Claude Opus 4.7 (1M context) --- .../inference/high_level_api/__init__.py | 0 .../test_async_llm_serve_guard.py | 65 ++++++++++ .../high_level_api/test_event_loop_manager.py | 98 +++++++++++++++ .../high_level_api/test_lifecycle_guards.py | 112 ++++++++++++++++++ .../high_level_api/test_normalize_prompts.py | 42 +++++++ .../high_level_api/test_serve_config.py | 27 +++++ .../high_level_api/test_validation.py | 61 ++++++++++ 7 files changed, 405 insertions(+) create mode 100644 tests/unit_tests/inference/high_level_api/__init__.py create mode 100644 tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py create mode 100644 tests/unit_tests/inference/high_level_api/test_event_loop_manager.py create mode 100644 tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py create mode 100644 tests/unit_tests/inference/high_level_api/test_normalize_prompts.py create mode 100644 tests/unit_tests/inference/high_level_api/test_serve_config.py create mode 100644 tests/unit_tests/inference/high_level_api/test_validation.py diff --git a/tests/unit_tests/inference/high_level_api/__init__.py b/tests/unit_tests/inference/high_level_api/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py b/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py new file mode 100644 index 00000000000..341a2465736 --- /dev/null +++ b/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py @@ -0,0 +1,65 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
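+
+"""Tests for the ``MegatronAsyncLLM.serve`` coordinator guard and the
+single-caller ``generate`` contract (engine construction is mocked out)."""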
+ +import asyncio +from unittest.mock import MagicMock + +import pytest + +import megatron.inference._llm_base as base_mod +from megatron.inference.async_llm import MegatronAsyncLLM +from megatron.inference.serve_config import ServeConfig + + +@pytest.fixture +def mock_pipeline(monkeypatch): + monkeypatch.setattr(base_mod, "DynamicInferenceContext", MagicMock()) + monkeypatch.setattr(base_mod, "GPTInferenceWrapper", MagicMock()) + monkeypatch.setattr(base_mod, "TextGenerationController", MagicMock()) + monkeypatch.setattr(base_mod, "DynamicInferenceEngine", MagicMock()) + + +@pytest.fixture +def fake_model_and_tokenizer(): + model = MagicMock() + model.config = MagicMock() + tokenizer = MagicMock() + return model, tokenizer + + +class TestAsyncLLMServeGuard: + @pytest.mark.asyncio + async def test_serve_requires_use_coordinator( + self, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + llm = MegatronAsyncLLM(model=model, tokenizer=tok) # direct mode + with pytest.raises(ValueError, match="requires use_coordinator=True"): + await llm.serve(ServeConfig()) + + @pytest.mark.asyncio + async def test_direct_mode_generate_is_single_caller( + self, mock_pipeline, fake_model_and_tokenizer, monkeypatch + ): + model, tok = fake_model_and_tokenizer + llm = MegatronAsyncLLM(model=model, tokenizer=tok) + + # Replace _generate_impl with a coroutine that holds until released, + # so the first call stays in flight while the second one starts. + gate = asyncio.Event() + + async def slow_impl(prompts, sp): + await gate.wait() + return [MagicMock() for _ in prompts] + + monkeypatch.setattr(llm, "_generate_impl", slow_impl) + + first = asyncio.create_task(llm.generate("hello")) + # Yield control so ``first`` reaches the await on ``gate``. + await asyncio.sleep(0) + + with pytest.raises(RuntimeError, match="single-caller"): + await llm.generate("world") + + # Release the first call so the test cleans up. + gate.set() + await first diff --git a/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py b/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py new file mode 100644 index 00000000000..73aecaeeda6 --- /dev/null +++ b/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py @@ -0,0 +1,98 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
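+
+"""Unit tests for ``_EventLoopManager``: ``loop`` access before ``start()``,
+start/stop idempotence, the ``submit`` / ``run_sync`` / ``run_async`` bridging
+helpers, and propagation of the spawning thread's CUDA device to the
+background daemon thread."""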
+ +import threading + +import pytest + +from megatron.inference._llm_base import _EventLoopManager + + +class TestEventLoopManager: + def test_loop_property_raises_before_start(self): + mgr = _EventLoopManager() + with pytest.raises(RuntimeError, match="start"): + _ = mgr.loop + + def test_start_is_idempotent_and_loop_accessible(self): + mgr = _EventLoopManager() + mgr.start() + try: + loop1 = mgr.loop + mgr.start() # double-start should be a no-op + loop2 = mgr.loop + assert loop1 is loop2 + assert loop1.is_running() + finally: + mgr.stop() + + def test_submit_run_sync_run_async(self): + mgr = _EventLoopManager() + mgr.start() + try: + async def coro(): + return 42 + + # submit returns a concurrent future + fut = mgr.submit(coro()) + assert fut.result() == 42 + + # run_sync blocks on result + assert mgr.run_sync(coro()) == 42 + + # run_async awaits the future from another loop + import asyncio + + assert asyncio.run(mgr.run_async(coro())) == 42 + finally: + mgr.stop() + + def test_stop_is_idempotent_and_joins_thread(self): + mgr = _EventLoopManager() + mgr.start() + thread = mgr._thread + assert thread is not None + mgr.stop() + assert not thread.is_alive() + # Second stop should be a no-op + mgr.stop() + + def test_start_propagates_cuda_device_to_daemon_thread(self, monkeypatch): + """The bug-fix invariant: daemon thread must inherit the spawning + thread's CUDA device via ``torch.cuda.set_device``.""" + import torch + + recorded = {} + main_thread_id = threading.get_ident() + + monkeypatch.setattr(torch.cuda, "is_available", lambda: True) + monkeypatch.setattr(torch.cuda, "current_device", lambda: 7) + + def fake_set_device(device): + recorded["device"] = device + recorded["thread_id"] = threading.get_ident() + + monkeypatch.setattr(torch.cuda, "set_device", fake_set_device) + + mgr = _EventLoopManager() + mgr.start() + try: + assert recorded.get("device") == 7 + # Must have run on the daemon thread, not on the spawning thread. + assert recorded.get("thread_id") != main_thread_id + finally: + mgr.stop() + + def test_start_skips_set_device_when_cuda_unavailable(self, monkeypatch): + import torch + + called = [] + + monkeypatch.setattr(torch.cuda, "is_available", lambda: False) + monkeypatch.setattr(torch.cuda, "set_device", lambda d: called.append(d)) + + mgr = _EventLoopManager() + mgr.start() + try: + assert called == [] + finally: + mgr.stop() diff --git a/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py b/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py new file mode 100644 index 00000000000..76dffbc3968 --- /dev/null +++ b/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py @@ -0,0 +1,112 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
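+
+"""Lifecycle-guard tests: direct mode rejects pause/unpause/suspend/resume,
+coordinator-mode worker ranks reject ``generate``, and ``shutdown`` /
+``wait_for_shutdown`` are idempotent no-ops in direct mode."""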
+ +from unittest.mock import MagicMock + +import pytest + +import megatron.inference._llm_base as base_mod +from megatron.inference.async_llm import MegatronAsyncLLM +from megatron.inference.llm import MegatronLLM + + +@pytest.fixture +def mock_pipeline(monkeypatch): + """Stub out the engine pipeline so the constructor runs without torch/megatron.""" + monkeypatch.setattr(base_mod, "DynamicInferenceContext", MagicMock()) + monkeypatch.setattr(base_mod, "GPTInferenceWrapper", MagicMock()) + monkeypatch.setattr(base_mod, "TextGenerationController", MagicMock()) + monkeypatch.setattr(base_mod, "DynamicInferenceEngine", MagicMock()) + + +@pytest.fixture +def fake_model_and_tokenizer(): + model = MagicMock() + model.config = MagicMock() + tokenizer = MagicMock() + return model, tokenizer + + +def _make_worker_instance(cls): + """Build a coordinator-mode worker-rank instance without running the + constructor's engine/runtime setup.""" + obj = cls.__new__(cls) + obj._engine = MagicMock() + obj._context = MagicMock() + obj._controller = MagicMock() + obj._use_coordinator = True + obj._is_primary_rank = False + obj._loop_manager = None + obj._coord_runtime = None + obj._shutdown_called = False + if cls is MegatronAsyncLLM: + obj._direct_generate_in_flight = False + obj._serve_started = False + return obj + + +class TestDirectModeLifecycleGuards: + """Direct mode: pause/unpause/suspend/resume must raise; shutdown is a no-op.""" + + def test_sync_lifecycle_raises_in_direct_mode( + self, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + llm = MegatronLLM(model=model, tokenizer=tok) + for method in ("pause", "unpause", "suspend", "resume"): + with pytest.raises(RuntimeError, match="use_coordinator=True"): + getattr(llm, method)() + # shutdown / wait_for_shutdown are no-ops, not errors. + llm.shutdown() + llm.wait_for_shutdown() + + @pytest.mark.asyncio + async def test_async_lifecycle_raises_in_direct_mode( + self, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + llm = MegatronAsyncLLM(model=model, tokenizer=tok) + for method in ("pause", "unpause", "suspend", "resume"): + with pytest.raises(RuntimeError, match="use_coordinator=True"): + await getattr(llm, method)() + # shutdown / wait_for_shutdown are no-ops in direct mode. + await llm.shutdown() + await llm.wait_for_shutdown() + + +class TestCoordinatorWorkerRankGuards: + """Coordinator mode + non-primary rank: generate must raise.""" + + def test_sync_generate_raises_on_worker_rank(self): + llm = _make_worker_instance(MegatronLLM) + with pytest.raises(RuntimeError, match="primary rank"): + llm.generate("hello") + + @pytest.mark.asyncio + async def test_async_generate_raises_on_worker_rank(self): + llm = _make_worker_instance(MegatronAsyncLLM) + with pytest.raises(RuntimeError, match="primary rank"): + await llm.generate("hello") + + +class TestShutdownIdempotence: + def test_sync_shutdown_idempotent_in_direct_mode( + self, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + llm = MegatronLLM(model=model, tokenizer=tok) + llm.shutdown() + assert llm._shutdown_called is True + # Second shutdown should be a no-op (idempotent). 
+ llm.shutdown() + assert llm._shutdown_called is True + + @pytest.mark.asyncio + async def test_async_shutdown_idempotent_in_direct_mode( + self, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + llm = MegatronAsyncLLM(model=model, tokenizer=tok) + await llm.shutdown() + assert llm._shutdown_called is True + await llm.shutdown() + assert llm._shutdown_called is True diff --git a/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py b/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py new file mode 100644 index 00000000000..95de353d3d2 --- /dev/null +++ b/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py @@ -0,0 +1,42 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +import pytest + +from megatron.inference._llm_base import _MegatronLLMBase + + +def _normalize(prompts): + """Bypass __init__ (which builds the engine pipeline) and call the method + on a bare instance.""" + obj = _MegatronLLMBase.__new__(_MegatronLLMBase) + return obj._normalize_prompts(prompts) + + +class TestNormalizePrompts: + def test_single_string(self): + assert _normalize("abc") == (["abc"], False) + + def test_single_token_id_list(self): + assert _normalize([1, 2, 3]) == ([[1, 2, 3]], False) + + def test_batch_of_strings(self): + assert _normalize(["a", "b"]) == (["a", "b"], True) + + def test_batch_of_token_id_lists(self): + assert _normalize([[1, 2], [3, 4]]) == ([[1, 2], [3, 4]], True) + + def test_empty_list_is_batch(self): + assert _normalize([]) == ([], True) + + @pytest.mark.parametrize( + "bad_input", + [ + {1, 2}, # set + 1.5, # float + [1.5], # list of floats (first elem is float) + {"k": "v"}, # dict + ], + ) + def test_unsupported_inputs_raise_typeerror(self, bad_input): + with pytest.raises(TypeError): + _normalize(bad_input) diff --git a/tests/unit_tests/inference/high_level_api/test_serve_config.py b/tests/unit_tests/inference/high_level_api/test_serve_config.py new file mode 100644 index 00000000000..1e4b1ca63b8 --- /dev/null +++ b/tests/unit_tests/inference/high_level_api/test_serve_config.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +from megatron.inference.serve_config import ServeConfig + + +class TestServeConfig: + def test_defaults(self): + cfg = ServeConfig() + assert cfg.host == "0.0.0.0" + assert cfg.port == 5000 + assert cfg.parsers == [] + assert cfg.verbose is False + assert cfg.frontend_replicas == 4 + + def test_overrides_preserved(self): + cfg = ServeConfig( + host="127.0.0.1", + port=8080, + parsers=["json", "tool_use"], + verbose=True, + frontend_replicas=1, + ) + assert cfg.host == "127.0.0.1" + assert cfg.port == 8080 + assert cfg.parsers == ["json", "tool_use"] + assert cfg.verbose is True + assert cfg.frontend_replicas == 1 diff --git a/tests/unit_tests/inference/high_level_api/test_validation.py b/tests/unit_tests/inference/high_level_api/test_validation.py new file mode 100644 index 00000000000..fb7d260b8f1 --- /dev/null +++ b/tests/unit_tests/inference/high_level_api/test_validation.py @@ -0,0 +1,61 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
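+
+"""Constructor-validation tests: ``coordinator_host`` / ``coordinator_port``
+require ``use_coordinator=True``, and direct-mode construction succeeds with
+the instance reporting itself as the primary rank."""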
+ +from unittest.mock import MagicMock + +import pytest + +import megatron.inference._llm_base as base_mod +from megatron.inference.async_llm import MegatronAsyncLLM +from megatron.inference.llm import MegatronLLM + + +@pytest.fixture +def mock_pipeline(monkeypatch): + """Stub out the engine pipeline so the constructor runs without torch/megatron.""" + monkeypatch.setattr(base_mod, "DynamicInferenceContext", MagicMock()) + monkeypatch.setattr(base_mod, "GPTInferenceWrapper", MagicMock()) + monkeypatch.setattr(base_mod, "TextGenerationController", MagicMock()) + monkeypatch.setattr(base_mod, "DynamicInferenceEngine", MagicMock()) + + +@pytest.fixture +def fake_model_and_tokenizer(): + model = MagicMock() + model.config = MagicMock() + tokenizer = MagicMock() + return model, tokenizer + + +@pytest.mark.parametrize("cls", [MegatronLLM, MegatronAsyncLLM]) +class TestConstructorValidation: + def test_coordinator_host_without_use_coordinator_raises( + self, cls, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + with pytest.raises(ValueError, match="coordinator_host/port require use_coordinator=True"): + cls( + model=model, + tokenizer=tok, + use_coordinator=False, + coordinator_host="x", + ) + + def test_coordinator_port_without_use_coordinator_raises( + self, cls, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + with pytest.raises(ValueError, match="coordinator_host/port require use_coordinator=True"): + cls( + model=model, + tokenizer=tok, + use_coordinator=False, + coordinator_port=5000, + ) + + def test_direct_mode_constructor_succeeds( + self, cls, mock_pipeline, fake_model_and_tokenizer + ): + model, tok = fake_model_and_tokenizer + llm = cls(model=model, tokenizer=tok) + assert llm.is_primary_rank is True + assert llm._use_coordinator is False From 2eb2e819a444f806434f6c77cfbde321e925f822 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 19:11:38 -0700 Subject: [PATCH 11/15] test(inference): add functional tests for offline_inference 4 modes with reused legacy goldens Co-Authored-By: Claude Opus 4.7 (1M context) --- .../golden_values_dev_dgx_h100.json | 2699 +++++++++++++++++ .../model_config.yaml | 66 + .../golden_values_dev_dgx_h100.json | 158 + .../model_config.yaml | 62 + .../golden_values_dev_dgx_h100.json | 2699 +++++++++++++++++ .../model_config.yaml | 65 + .../golden_values_dev_dgx_h100.json | 158 + .../model_config.yaml | 61 + .../recipes/h100/gpt-offline-inference.yaml | 77 + 9 files changed, 6045 insertions(+) create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/model_config.yaml create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/model_config.yaml create mode 100644 tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json create mode 100644 
tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml create mode 100644 tests/test_utils/recipes/h100/gpt-offline-inference.yaml diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..dd8b08e446f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json @@ -0,0 +1,2699 @@ +{ + "0": { + "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. 
So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", + "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", + "generated_tokens": [ + 1659, + 1395, + 1261, + 1036, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1036, + 1049, + 1044, + 1636, + 1010, + 1036, + 1659, + 1036, + 1659, + 1010, + 1036, + 1659, + 1045, + 1659, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1659, + 1036, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1044, + 1659, + 1010, + 1045, + 1049, + 1010, + 1036, + 1010, + 1049, + 1046, + 1053, + 1046, + 1010, + 1036, + 1010, + 1036, + 1044, + 1636, + 1010, + 1036, + 1046, + 1010, + 1036, + 1010, + 1049, + 1044, + 1049, + 1046, + 1049, + 1010, + 1073, + 1010, + 1036, + 1046, + 1010, + 1073, + 1010, + 1010, + 1010, + 7801, + 1010, + 1036, + 1044, + 1044, + 1044, + 1048, + 1044, + 1049, + 1044, + 1048, + 1044, + 1048, + 1046, + 1048, + 1010, + 1785, + 1010, + 1784, + 1010, + 1784, + 1010, + 1784, + 1010 + ], + "latency": 9.61214280128479, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.7319135665893555, + -2.188307285308838, + -0.7547445297241211, + -0.7294313311576843, + -10.238386154174805, + -3.3775341510772705, + -6.394498825073242, + -7.354557037353516, + -9.018157958984375, + -3.012073040008545, + -3.2584073543548584, + -5.220732688903809, + -4.620487213134766, + -2.5078930854797363, + -3.752683162689209, + -0.13360372185707092, + -0.05705544352531433, + -0.41462242603302, + -1.585279941558838, + -1.6438164710998535, + -1.9557222127914429, + -0.3989897072315216, + -0.0365302674472332, + -6.368816375732422, + -0.8731719255447388, + -0.022585075348615646, + -0.2775891423225403, + -0.0027362785767763853, + -0.0006812873762100935, + -1.581446647644043, + -0.008688976056873798, + -0.3532317280769348, + -6.071163177490234, + -9.162371635437012, + -9.965556144714355, + -2.400461196899414, + -2.9898362159729004, + -2.9803032875061035, + -2.12601900100708, + -3.500912666320801, + -7.015069007873535, + -2.278961420059204, + -0.46380555629730225, + -4.078739166259766, + -1.9430254697799683, + -3.5642244815826416, + -3.689701795578003, + -6.201474189758301, + -6.580833911895752, + -2.3081111907958984, + -5.42717170715332, + -1.1886008977890015, + -1.172760248184204, + -1.3571951389312744, + -1.3551844358444214, + -3.376784324645996, + -0.05118789151310921, + -4.064360618591309, + -2.575554847717285, + -0.6994737386703491, + -2.56724214553833, + -2.1888976097106934, + -0.4816131591796875, + -4.070178985595703, + -2.0060782432556152, + -6.858033180236816, + -0.059200502932071686, + -3.214278221130371, + -0.9671833515167236, + -0.823198676109314, + -1.0130078792572021, + -4.595561981201172, + -0.012724989093840122, + -5.214311599731445, + -8.246870040893555, + 
-3.1476030349731445, + -3.299684524536133, + -4.218191146850586, + -7.318399429321289, + -0.8580498695373535, + -3.0894036293029785, + -1.886361002922058, + -7.217658996582031, + -3.271679639816284, + -3.9717154502868652, + -1.8835484981536865, + -10.034332275390625, + -11.382490158081055, + -5.417011260986328, + -7.505967140197754, + -2.33837890625, + -0.07904055714607239, + -3.294971227645874, + -7.813640594482422, + -1.7646901607513428, + -4.025320053100586, + -3.5977325439453125, + -4.390352249145508, + -9.147806167602539, + -0.5303041934967041, + -7.721246242523193, + -0.6311959028244019, + -0.8119025230407715, + -0.7227814197540283, + -1.8369406461715698, + -0.20933297276496887, + -1.5395950078964233, + -4.424448490142822, + -4.084965705871582, + -3.355497360229492, + -1.0475609302520752, + -6.479413986206055, + -0.7810530662536621, + -2.132437229156494, + -6.648703098297119, + -2.9522438049316406, + -1.2485712766647339, + -4.040503025054932, + -2.3415768146514893, + -5.358206748962402, + -1.6258506774902344, + -3.956300973892212, + -0.732298731803894, + -7.441117286682129, + -1.5242161750793457, + -2.4555861949920654, + -4.295163154602051, + -9.687600135803223, + -0.8213484883308411, + -1.2446978092193604, + -0.01942702941596508, + -4.619411468505859, + -3.3297007083892822, + -2.2139487266540527, + -3.691431999206543, + -2.6574106216430664, + -6.075929641723633, + -0.6123450994491577, + -1.2942559719085693, + -0.6262839436531067, + -7.398006439208984, + -4.4869890213012695, + -4.202048301696777, + -4.982994079589844, + -0.637227475643158, + -3.061023235321045, + -10.117584228515625, + -3.8567495346069336, + -4.0480828285217285, + -2.472019672393799, + -4.246374607086182, + -1.3939155340194702, + -7.132441520690918, + -0.20108745992183685, + -4.986658573150635, + -4.387957572937012, + -0.01108358334749937, + -4.209756851196289, + -7.271108627319336, + -4.047314643859863, + -2.6497321128845215, + -1.4763175249099731, + -0.28365400433540344, + -3.5247769355773926, + -1.4226995706558228, + -4.327237129211426, + -2.0407187938690186, + -6.1437907218933105, + -1.5190880298614502, + -2.5511486530303955, + -7.504094123840332, + -2.152172565460205, + -6.708334922790527, + -6.913146495819092, + -3.6959621906280518, + -6.752341270446777, + -0.63083815574646, + -0.12433214485645294, + -5.0525641441345215, + -4.435934066772461, + -0.45601028203964233, + -6.3459577560424805, + -9.882917404174805, + -3.1422882080078125, + -2.550520658493042, + -3.2099051475524902, + -6.278127193450928, + -0.07764133810997009, + -3.155696153640747, + -1.933587670326233, + -9.61027717590332, + -6.211391925811768, + -4.664543151855469, + -6.783782005310059, + -5.676271438598633, + -8.605900764465332, + -0.0824289619922638, + -3.5463995933532715, + -13.374168395996094, + -1.2401021718978882, + -1.8734056949615479, + -3.4154422283172607, + -1.6733763217926025, + -17.633970260620117, + -9.345113754272461, + -0.6277351975440979, + -2.9617538452148438, + -2.5565333366394043, + -10.10580825805664, + -7.130337715148926, + -7.36820125579834, + -4.098911285400391, + -5.747079372406006, + -2.945054769515991, + -0.7887389063835144, + -1.6583149433135986, + -1.0165244340896606, + -6.581666946411133, + -5.926386833190918, + -5.845194339752197, + -0.9657630920410156, + -7.868755340576172, + -1.3244551420211792, + -0.2657390236854553, + -0.06403665244579315, + -2.983020782470703, + -5.943899631500244, + -7.877285957336426, + -3.593116283416748, + -3.819509506225586, + -7.226177215576172, + -2.5206997394561768, + -3.385587215423584, 
+ -0.37499159574508667, + -1.4698283672332764, + -3.1460342407226562, + -0.0077166082337498665, + -4.350916862487793, + -3.2183218002319336, + -0.6242184638977051, + -1.4782464504241943, + -2.8054311275482178, + -3.0831401348114014, + -12.17662525177002, + -2.113419532775879, + -1.6448111534118652, + -2.1834323406219482, + -0.7630388140678406, + -10.1896390914917, + -6.234405517578125, + -11.46288776397705, + -1.003785490989685, + -4.211658477783203, + -1.5010679960250854, + -5.859302043914795, + -2.0465080738067627, + -3.7468819618225098, + -4.684195518493652, + -4.318704128265381, + -2.7234389781951904, + -9.00437068939209, + -3.043811321258545, + -3.1384406089782715, + -2.713779926300049, + -2.095993995666504, + -2.1484954357147217, + -10.274479866027832, + -0.682350754737854, + -0.25973302125930786, + -3.6964316368103027, + -13.434456825256348, + -2.3368239402770996, + -5.382724761962891, + -1.9073458909988403, + -5.905669212341309, + -0.032165709882974625, + -1.6530004739761353, + -2.728893280029297, + -1.640552043914795, + -1.1391171216964722, + -1.4353511333465576, + -4.003787994384766, + -0.3450564742088318, + -0.7168521285057068, + -0.34650325775146484, + -0.3616408705711365, + -7.062709331512451, + -1.2851682901382446, + -2.299129009246826, + -8.800156593322754, + -5.208735466003418, + -4.780910491943359, + -2.78342342376709, + -4.469717979431152, + -6.909726619720459, + -2.5114197731018066, + -0.659822404384613, + -0.6915416121482849, + -3.2363741397857666, + -0.5283617377281189, + -0.10473938286304474, + -6.215325832366943, + -7.283237934112549, + -1.6797031164169312, + -11.50100040435791, + -7.5822978019714355, + -3.387317657470703, + -11.407575607299805, + -5.441976547241211, + -3.3264851570129395, + -0.7265786528587341, + -1.382750153541565, + -7.841699600219727, + -8.105277061462402, + -3.9569506645202637, + -4.963083267211914, + -0.5492897629737854, + -4.6081390380859375, + -5.870400905609131, + -3.957930088043213, + -5.275494575500488, + -4.105091094970703, + -2.15435528755188, + -2.8472700119018555, + -1.1278448104858398, + -8.226571083068848, + -0.40629008412361145, + -9.916461944580078, + -4.616743087768555, + -1.691868543624878, + -0.6639478802680969, + -2.5716753005981445, + -6.676954746246338, + -6.535329818725586, + -0.4170510768890381, + -1.443942904472351, + -3.145481824874878, + -1.440589427947998, + -0.26935356855392456, + -0.9647155404090881, + -4.335958957672119, + -1.5647850036621094, + -5.890466690063477, + -3.01654052734375, + -1.9168468713760376, + -3.7365682125091553, + -8.001864433288574, + -10.680083274841309, + -4.489352226257324, + -4.6058149337768555, + -7.69011116027832, + -3.6247005462646484, + -1.5600426197052002, + -10.2160062789917, + -5.004643440246582, + -0.19602319598197937, + -3.375545024871826, + -2.669325590133667, + -1.3932737112045288, + -1.6410658359527588, + -6.847603797912598, + -6.744344711303711, + -0.5215591192245483, + -0.25840020179748535, + -1.1448237895965576, + -5.57253885269165, + -7.251138687133789, + -4.221924781799316, + -0.7688062787055969, + -2.504502534866333, + -3.146519660949707, + -2.206653356552124, + -1.4295082092285156, + -7.96943998336792, + -4.332189083099365, + -2.5750505924224854, + -1.7102608680725098, + -5.311381816864014, + -8.897522926330566, + -2.994919538497925, + -3.3397974967956543, + -2.1794328689575195, + -2.437566041946411, + -0.3181810975074768, + -0.27412793040275574, + -0.7914466857910156, + -2.3470635414123535, + -2.4099245071411133, + -2.491870880126953, + -3.024170160293579, + 
-1.9719040393829346, + -11.373910903930664, + -1.4279751777648926, + -0.14573107659816742, + -2.055763006210327, + -6.366893291473389, + -4.24091911315918, + -0.00709194503724575, + -2.0199716091156006, + -2.524750232696533, + -1.4272525310516357, + -0.5185190439224243, + -2.927150011062622, + -2.7070627212524414, + -3.365638017654419, + -4.318085193634033, + -7.773144721984863, + -1.7947180271148682, + -7.657534599304199, + -8.767786026000977, + -14.74280071258545, + -1.8042558431625366, + -3.2712037563323975, + -1.4002125263214111, + -4.887944221496582, + -1.4821010828018188, + -1.5255622863769531, + -5.879070281982422, + -4.463839530944824, + -5.1955976486206055, + -5.665647506713867, + -0.3775045573711395, + -5.9350481033325195, + -2.800539255142212, + -0.13162286579608917, + -3.034379720687866, + -4.729524612426758, + -4.6252641677856445, + -3.850942611694336, + -2.4760568141937256, + -6.059760093688965, + -10.12075138092041, + -0.9469369649887085, + -11.595907211303711, + -6.875324726104736, + -4.268826007843018, + -2.835529088973999, + -3.8626279830932617, + -4.876199245452881, + -0.013071090914309025, + -4.964417934417725, + -0.7445687055587769, + -5.707155227661133, + -6.10660457611084, + -4.317755699157715, + -4.440443992614746, + -2.9202542304992676, + -4.743522644042969, + -1.2569392919540405, + -2.8675737380981445, + -2.3151841163635254, + -4.318130970001221, + -1.9054772853851318, + -1.1808521747589111, + -0.765956461429596, + -2.768916606903076, + -6.237791061401367, + -1.7224305868148804, + -7.137521743774414, + -4.512486457824707, + -1.9069950580596924, + -4.145983695983887, + -5.365190505981445, + -0.059828490018844604, + -2.273892879486084, + -3.4013004302978516, + -5.035730361938477, + -6.501443386077881, + -9.903446197509766, + -1.6332892179489136, + -2.1572084426879883, + -1.6149548292160034, + -1.4698481559753418, + -6.01010799407959, + -2.2243528366088867, + -6.900836944580078, + -6.0930986404418945, + -2.974020481109619, + -3.225423574447632, + -8.423272132873535, + -1.3423724174499512, + -3.626147508621216, + -0.4862469434738159, + -6.860866546630859, + -3.8910953998565674, + -2.33319354057312, + -1.7229185104370117, + -2.215972423553467, + -8.99046516418457, + -4.099084854125977, + -2.4191012382507324, + -8.288970947265625, + -2.9641928672790527, + -1.5036451816558838, + -3.0544614791870117, + -0.0715634673833847, + -2.444031238555908, + -4.520998954772949, + -3.972568988800049, + -0.4985870122909546, + -2.1651363372802734, + -3.4427435398101807, + -1.730639100074768, + -0.9458961486816406, + -7.740211009979248, + -9.39163875579834, + -3.895984172821045, + -1.7523534297943115, + -5.41331672668457, + -8.910720825195312, + -12.971094131469727, + -3.0455880165100098, + -10.501265525817871, + -3.3864927291870117, + -4.842309951782227, + -3.9964733123779297, + -7.3046793937683105, + -2.6607093811035156, + -1.3541781902313232, + -5.003270626068115, + -3.944551944732666, + -0.11356143653392792, + -5.174440383911133, + -9.628616333007812, + -8.654989242553711, + -8.980416297912598, + -6.670101642608643, + -5.488286018371582, + -5.943419933319092, + -2.126483201980591, + -8.054739952087402, + -7.458671569824219, + -2.5267202854156494, + -6.455472946166992, + -8.655346870422363, + -7.903901100158691, + -6.221062660217285, + -7.129237174987793, + -4.2345380783081055, + -2.5375306606292725, + -7.697700500488281, + -1.567080020904541, + -2.084331750869751, + -0.25020831823349, + -1.5145041942596436, + -4.619244575500488, + -0.2970108985900879, + -0.4977554678916931, + 
-6.197869300842285, + -4.030620098114014, + -7.232107639312744, + -0.21076253056526184, + -1.563366174697876, + -1.133756160736084, + -2.708237648010254, + -4.080535888671875, + -0.6818401217460632, + -0.1864331066608429, + -0.49012088775634766, + -8.732468605041504, + -11.945040702819824, + -5.243098735809326, + -1.5294703245162964, + -0.8935543298721313, + -0.6174070835113525, + -1.5068217515945435, + -3.5766501426696777, + -5.393096923828125, + -4.202867031097412, + -14.765748023986816, + -5.2513813972473145, + -0.7597705721855164, + -0.2502063810825348, + -1.7403976917266846, + -2.8000779151916504, + -1.9808133840560913, + -2.1654744148254395, + -1.8629226684570312, + -3.222038745880127, + -0.040942225605249405, + -2.3384013175964355, + -10.210381507873535, + -4.5859761238098145, + -0.5805734395980835, + -3.7019288539886475, + -2.001936674118042, + -2.7876083850860596, + -2.9799084663391113, + -4.349887371063232, + -0.0792960673570633, + -1.4366114139556885, + -1.0813264846801758, + -1.3510822057724, + -6.7060699462890625, + -5.436615943908691, + -3.978389263153076, + -6.785447597503662, + -6.147171497344971, + -3.97414231300354, + -4.332991600036621, + -0.9269428253173828, + -5.1237101554870605, + -4.486598968505859, + -0.04678357392549515, + -1.0307552814483643, + -1.4249452352523804, + -4.517682075500488, + -3.561821699142456, + -2.0815205574035645, + -0.6041194200515747, + -5.992964744567871, + -7.092092514038086, + -0.48916709423065186, + -2.6405677795410156, + -4.3345723152160645, + -3.533582925796509, + -3.1233346462249756, + -3.107872486114502, + -1.9901115894317627, + -3.1052846908569336, + -1.8440347909927368, + -6.21368408203125, + -1.8796799182891846, + -2.705214738845825, + -0.2987763583660126, + -4.070865154266357, + -1.6675832271575928, + -1.3896636962890625, + -1.5731089115142822, + -3.526170015335083, + -2.5088443756103516, + -1.208929419517517, + -3.673125743865967, + -2.501532554626465, + -6.875064373016357, + -8.512459754943848, + -1.042314052581787, + -3.657850980758667, + -7.0950798988342285, + -4.974049091339111, + -8.14085578918457, + -3.529888153076172, + -1.9389504194259644, + -7.0902204513549805, + -2.409292459487915, + -2.9428021907806396, + -1.688283085823059, + -3.622368335723877, + -2.0903351306915283, + -4.160663604736328, + -3.1683764457702637, + -1.2135626077651978, + -7.566033363342285, + -3.1186251640319824, + -5.899919509887695, + -0.9518840312957764, + -2.656729221343994, + -2.2994377613067627, + -6.806836128234863, + -1.280236840248108, + -2.838846206665039, + -1.3598848581314087, + -11.707776069641113, + -3.134333372116089, + -0.6230669617652893, + -8.219222068786621, + -7.562507152557373, + -7.489459037780762, + -1.5368008613586426, + -7.149652481079102, + -5.749268054962158, + -3.162869691848755, + -2.7235195636749268, + -6.128931999206543, + -1.1934199333190918, + -3.986410617828369, + -3.76609468460083, + -1.712721586227417, + -3.195504903793335, + -8.397743225097656, + -3.1260581016540527, + -9.792022705078125, + -4.217884540557861, + -11.583260536193848, + -5.987588882446289, + -5.178754806518555, + -6.994749069213867, + -5.167606353759766, + -7.124668121337891, + -6.201416015625, + -10.203682899475098, + -6.858526229858398, + -2.733592987060547, + -5.078882217407227, + -9.003358840942383, + -4.704894542694092, + -3.9085562229156494, + -7.247268199920654, + -7.091092109680176, + -4.4150166511535645, + -7.56699275970459, + -9.485116004943848, + -1.9977033138275146, + -6.65272331237793, + -2.236643075942993, + -7.518955707550049, + 
-5.525973320007324, + -4.67877721786499, + -6.608670234680176, + -5.536133766174316, + -10.772479057312012, + -10.8853178024292, + -3.6156129837036133, + -6.751470565795898, + -6.4537434577941895, + -3.4220399856567383, + -8.251005172729492, + -3.2146153450012207, + -6.330069541931152, + -1.5551663637161255, + -6.520583629608154, + -10.450878143310547, + -5.8788957595825195, + -3.7398200035095215, + -3.9084208011627197, + -0.3640081584453583, + -6.961522102355957, + -6.066243648529053, + -7.270624160766602, + -5.098455429077148, + -2.7642822265625, + -5.460171699523926, + -7.362828731536865, + -2.558631658554077, + -2.186410427093506, + -2.5309929847717285, + -2.46756649017334, + -2.0306026935577393, + -1.8713470697402954, + -2.108008623123169, + -1.2698389291763306, + -2.1712756156921387, + -2.4432802200317383, + -1.1477653980255127, + -1.8417484760284424, + -2.5971946716308594, + -1.8250831365585327, + -2.103092670440674, + -2.5183165073394775, + -2.9367291927337646, + -1.9412965774536133, + -1.7692793607711792, + -2.864521026611328, + -3.1332175731658936, + -1.098311185836792, + -2.946441173553467, + -2.2800471782684326, + -3.1929852962493896, + -2.754260778427124, + -3.485616445541382, + -3.3010287284851074, + -2.5537776947021484, + -2.6752865314483643, + -3.1617612838745117, + -2.4571690559387207, + -2.060081958770752, + -2.425969362258911, + -2.212725877761841, + -2.4232254028320312, + -3.0587053298950195, + -2.4074010848999023, + -2.457937479019165, + -2.319617986679077, + -2.6340954303741455, + -2.599524736404419, + -2.5302212238311768, + -1.6849274635314941, + -2.2609786987304688, + -2.039928674697876, + -1.9474098682403564, + -2.3550753593444824, + -1.718749761581421, + -2.413884162902832, + -1.6247628927230835, + -2.4784040451049805, + -1.828325629234314, + -1.3880831003189087, + -1.4448199272155762, + -1.1477117538452148, + -1.1669728755950928, + -1.8787822723388672, + -1.5565840005874634, + -1.6666553020477295, + -1.747725248336792, + -1.959598422050476, + -2.0376486778259277, + -2.345367431640625, + -2.055098533630371, + -1.3940613269805908, + -3.4385242462158203, + -2.7489635944366455, + -3.2590157985687256, + -3.1128957271575928, + -1.7070379257202148, + -3.9010369777679443, + -3.21574068069458, + -3.3850393295288086, + -1.8778185844421387, + -2.698211908340454, + -1.8060741424560547, + -2.0845324993133545, + -3.4797585010528564, + -2.263254404067993, + -3.083108901977539, + -1.6589758396148682, + -2.687279224395752, + -1.77505624294281, + -2.6142921447753906, + -1.934045672416687, + -1.8834377527236938, + -2.8038980960845947, + -1.550542950630188, + -3.4054152965545654, + -1.724036693572998, + -2.3146564960479736, + -1.5134503841400146, + -2.9289023876190186, + -1.5285141468048096, + -3.421035051345825, + -1.3757282495498657, + -3.441431760787964, + -1.5286564826965332, + -3.4372904300689697, + -3.173043966293335, + -1.1313854455947876, + -1.486415147781372, + -2.506413221359253, + -3.5165903568267822, + -1.4112176895141602, + -3.7175354957580566, + -2.2170844078063965, + -0.704839289188385, + -0.6626103520393372, + -2.5483946800231934, + -0.949668288230896, + -3.1339564323425293, + -1.3326977491378784, + -3.5493476390838623, + -0.6785370111465454, + -3.236161470413208, + -2.1347084045410156, + -3.802447557449341, + -2.585503339767456, + -3.9361765384674072, + -2.290905714035034, + -3.714280605316162, + -1.793616533279419, + -3.7252492904663086, + -1.450188159942627, + -3.11938738822937, + -1.25174880027771 + ] + }, + "32": { + "input_prompt": "create a conversational 
article", + "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", + "generated_tokens": [ + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046 + ], + "latency": 47.09983468055725, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.4165568351745605, + -11.358176231384277, + -0.0701780766248703, + -7.797665119171143, + -2.6805992126464844, + -1.4707680940628052, + -3.0390255451202393, + -1.6902849674224854, + -1.270594835281372, + -1.1936196088790894, + -1.2523558139801025, + -2.7270259857177734, + -1.2371309995651245, + -0.9618493318557739, + -0.4379909038543701, + -1.3917063474655151, + -1.1055524349212646, + -0.9122569561004639, + -0.9911308288574219, + -0.08436793833971024, + -0.5424078106880188, + -0.9181017279624939, + -0.5873759388923645, + -0.19014373421669006, + -0.06655456870794296, + -0.15252672135829926, + -0.09415211528539658, + -0.009787309914827347, + -0.013910251669585705, + -0.005296128336340189, + -0.005677408073097467, + -0.02013739012181759, + -0.21594694256782532, + -0.07153760641813278, + -0.0066444179974496365, + -0.010198505595326424, + -0.011980246752500534, + -0.003686776151880622, + -0.0037619550712406635, + -0.0022467151284217834, + -0.004088377580046654, + -0.021828632801771164, + -0.0012669878778979182, + -0.09768074005842209, + -0.02652405947446823, + -0.0019286142196506262, + -0.002283824374899268, + -0.0032225127797573805, + -0.0009741804678924382, + -0.0009415484382770956, + -0.001211624126881361, + -0.001135300612077117, + -0.002340436913073063, + -0.0010846928926184773, + -0.0509282611310482, + -0.03832047060132027, + -0.00257422705180943, + -0.0022806129418313503, + -0.00262785074301064, + -0.0008195855189114809, + -0.0010239601833745837, + -0.0013777059502899647, + -0.0009899006690829992, + -0.0018756669014692307, + -0.0015304292319342494, + -0.08506463468074799, + -0.01893703266978264, + -0.0013797297142446041, + -0.0014461545506492257, + -0.0013971101725474, + 
-0.0005869334563612938, + -0.0005212855176068842, + -0.000876757490914315, + -0.0005256939912214875, + -0.0012863941956311464, + -0.0015691122971475124, + -0.051276568323373795, + -0.00973513163626194, + -0.0010469438275322318, + -0.0011531615164130926, + -0.0009969270322471857, + -0.00038342276820912957, + -0.0004032037395518273, + -0.000730247818864882, + -0.0003275334893260151, + -0.0008700875914655626, + -0.0017572689102962613, + -0.044393111020326614, + -0.013102858327329159, + -0.0011463745031505823, + -0.0012070996453985572, + -0.0012325793504714966, + -0.0005048430757597089, + -0.0004876854654867202, + -0.0007901645149104297, + -0.00041500062798149884, + -0.0009869233472272754, + -0.0018687656847760081, + -0.03943866863846779, + -0.014425630681216717, + -0.0014756753807887435, + -0.001423775334842503, + -0.001209719106554985, + -0.0005046047735959291, + -0.00042167355422861874, + -0.0007688426994718611, + -0.0002699726028367877, + -0.0006598440813831985, + -0.0017849955474957824, + -0.038999658077955246, + -0.012665312737226486, + -0.0014427024871110916, + -0.0014492495683953166, + -0.001016576774418354, + -0.00042083943844772875, + -0.00033241944038309157, + -0.0006403064471669495, + -0.00022373080719262362, + -0.0007053509471006691, + -0.0018597226589918137, + -0.030997740104794502, + -0.011259939521551132, + -0.0012655591126531363, + -0.00134151556994766, + -0.0008106521563604474, + -0.0003513672563713044, + -0.0002964295563288033, + -0.0006368515896610916, + -0.00020180096908006817, + -0.0005779979983344674, + -0.0016014858847483993, + -0.0271126888692379 + ] + }, + "64": { + "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. 
Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", + "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", + "generated_tokens": [ + 2, + 1784, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048 + ], + "latency": 85.16301679611206, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -12.107745170593262, + -2.9727728366851807, + -3.720092535018921, + -5.592433929443359, + -10.964235305786133, + -3.654498338699341, + -9.33439826965332, + -4.833785057067871, + -5.187321662902832, + -2.6944785118103027, + -6.9262237548828125, + -0.654232919216156, + -0.5550781488418579, + -0.21346639096736908, + -0.0134271876886487, + -0.010840100236237049, + -1.3878544569015503, + -0.6296291351318359, + -7.9766011238098145, + -0.4393192231655121, + -5.639142036437988, + -3.277270793914795, + -1.0206468105316162, + -11.703084945678711, + -0.7100943922996521, + -0.2809169888496399, + -2.771284818649292, + -7.190817832946777, + -4.048691749572754, + -0.012056218460202217, + -3.3802318572998047, + -0.6807184815406799, + -3.4844107627868652, + -3.312331199645996, + -0.5001641511917114, + -2.61255145072937, + -4.243694305419922, + -4.333778381347656, + 
-6.0625810623168945, + -0.011777156963944435, + -0.37577226758003235, + -0.9490834474563599, + -3.5450198650360107, + -2.1778035163879395, + -0.45957911014556885, + -3.00771164894104, + -1.7600425481796265, + -0.09766030311584473, + -2.467618942260742, + -1.329679012298584, + -0.8384320735931396, + -1.1864604949951172, + -3.628342866897583, + -0.2470003068447113, + -1.8938640356063843, + -5.168431282043457, + -0.05005566030740738, + -2.258014678955078, + -2.449028968811035, + -0.0034086955711245537, + -3.9485883712768555, + -1.6201664209365845, + -5.139942646026611, + -4.859354496002197, + -0.23686674237251282, + -0.5541543364524841, + -2.5826025009155273, + -6.114635467529297, + -4.3380208015441895, + -0.7412900924682617, + -0.3221715986728668, + -0.13805493712425232, + -4.1797332763671875, + -7.3456268310546875, + -0.13762745261192322, + -2.0905232429504395, + -1.0178627967834473, + -4.108260631561279, + -0.6007124185562134, + -1.0410642623901367, + -4.122039794921875, + -0.35905471444129944, + -1.4274661540985107, + -4.139932155609131, + -0.4237431585788727, + -1.6294409036636353, + -0.9811424016952515, + -4.132790565490723, + -1.1318120956420898, + -6.8258256912231445, + -1.5455098152160645, + -0.6984409093856812, + -13.664215087890625, + -0.1166313961148262, + -1.6347849369049072, + -0.28875046968460083, + -0.03130083531141281, + -1.5293006896972656, + -1.6488375663757324, + -4.224111557006836, + -4.760683059692383, + -1.9758747816085815, + -1.5828256607055664, + -2.8463857173919678, + -0.2620386481285095, + -1.7243889570236206, + -1.7945923805236816, + -0.8884308338165283, + -0.3766394555568695, + -0.34033581614494324, + -9.05566692352295, + -0.22754782438278198, + -0.033802058547735214, + -0.34108465909957886, + -0.5644669532775879, + -2.0925779342651367, + -4.547505855560303, + -10.870464324951172, + -1.1072022914886475, + -5.503787994384766, + -3.259672164916992, + -0.007964519783854485, + -3.0111639499664307, + -4.246737480163574, + -0.7813188433647156, + -3.331031322479248, + -4.485962867736816, + -0.9492117166519165, + -2.6757047176361084, + -1.1591349840164185, + -1.122117519378662, + -2.629878044128418, + -5.986321926116943, + -0.2146703153848648, + -0.002392764901742339, + -7.372479438781738, + -0.007077385671436787, + -0.06599216908216476, + -0.0970711037516594, + -3.2874932289123535, + -0.0019583588000386953, + -0.9122000336647034, + -4.930907249450684, + -0.019508399069309235, + -0.308611661195755, + -0.07778516411781311, + -3.8497893810272217, + -0.46124517917633057, + -0.38821348547935486, + -2.668412208557129, + -1.845987319946289, + -0.06470083445310593, + -0.006619549356400967, + -1.2610487937927246, + -0.13015533983707428, + -3.365312099456787, + -0.0014690094394609332, + -1.6789823770523071, + -1.2499005794525146, + -3.3992111682891846, + -5.563300132751465, + -0.823418140411377, + -4.24124813079834, + -1.6597849130630493, + -0.6941139698028564, + -1.5637556314468384, + -0.5482053756713867, + -0.9507225751876831, + -3.764758586883545, + -0.0006518622976727784, + -0.7540555000305176, + -5.058262825012207, + -0.3302401602268219, + -2.8130555152893066, + -0.17079885303974152, + -2.871047019958496, + -0.3991694450378418, + -3.1476998329162598, + -0.3488404452800751, + -2.0545666217803955, + -4.201597690582275, + -5.164614677429199, + -0.0271432027220726, + -0.0009785869624465704, + -3.3444161415100098, + -1.3117046356201172, + -6.375423431396484, + -0.05535568296909332, + -0.3919340968132019, + -0.060594215989112854, + -6.507473468780518, + -0.0023910999298095703, + 
-2.143423318862915, + -3.335618257522583, + -2.953970432281494, + -0.0013383012264966965, + -0.8080525398254395, + -0.29526084661483765, + -0.04036511853337288, + -3.231475353240967, + -1.0585589408874512, + -6.136373043060303, + -0.006182829383760691, + -0.035548023879528046, + -5.509808540344238, + -1.8490750789642334, + -9.83314037322998, + -0.07037576287984848, + -3.1621387004852295, + -6.762360095977783, + -1.3490527868270874, + -3.601043462753296, + -1.176393985748291, + -0.4342959523200989, + -0.06266004592180252, + -5.464046001434326, + -0.017946599051356316, + -1.0416009426116943, + -1.6117159128189087, + -12.289417266845703, + -1.5004339218139648, + -5.76563835144043, + -4.038386821746826, + -0.20812086760997772, + -3.6306562423706055, + -1.3901070356369019, + -1.087137222290039, + -2.423213243484497, + -4.503086090087891, + -0.0008031480247154832, + -0.03627370297908783, + -0.1653430461883545, + -7.958648681640625, + -1.1018548011779785, + -1.290948748588562, + -3.8049263954162598, + -1.8253734111785889, + -0.059022851288318634, + -0.0013984196120873094, + -4.698851585388184, + -2.5421664714813232, + -0.024493809789419174, + -4.828659534454346, + -3.0295286178588867, + -3.550312042236328, + -0.1185273677110672, + -0.22595760226249695, + -0.10782183706760406, + -1.4033282995224, + -0.4485701024532318, + -0.2889708876609802, + -0.05471855774521828, + -0.007632025051862001, + -2.1156554222106934, + -0.6249589323997498, + -4.198577404022217, + -0.14178156852722168, + -4.284021377563477, + -2.227515935897827, + -3.5022120475769043, + -0.19575819373130798, + -15.964509963989258, + -4.055960655212402, + -11.125024795532227, + -0.7681724429130554, + -3.0436902046203613, + -7.030262470245361, + -4.376729488372803, + -5.476145267486572, + -0.4219042658805847, + -3.7689766883850098, + -0.060010604560375214, + -0.8134393692016602, + -0.11386934667825699, + -0.025473715737462044, + -0.09736856073141098, + -4.357361793518066, + -0.3670865297317505, + -0.08063744008541107, + -0.1311480849981308, + -1.0903867483139038, + -1.2705107927322388, + -1.5076212882995605, + -4.295275688171387, + -0.04185756668448448, + -0.19810955226421356, + -1.9645220041275024, + -0.9597910642623901, + -0.13429655134677887, + -0.002283110748976469, + -7.066074371337891, + -3.639211654663086, + -1.0263917446136475, + -8.124760627746582, + -1.132537841796875, + -0.09160765260457993, + -0.08996370434761047, + -10.165366172790527, + -3.501585006713867, + -0.0019847711082547903, + -0.05309417471289635, + -0.31209683418273926, + -0.15089339017868042, + -1.23564875125885, + -1.2685208320617676, + -7.832758903503418, + -0.19271136820316315, + -0.014305183663964272, + -0.0007532381569035351, + -0.44688940048217773, + -2.6239724159240723, + -1.738666296005249, + -1.6480977535247803, + -0.46753185987472534, + -8.656959533691406, + -3.79868483543396, + -0.9281394481658936, + -2.2381181716918945, + -1.7654449939727783, + -0.4948798418045044, + -0.025028761476278305, + -1.5435361862182617, + -1.6390818357467651, + -1.4962153434753418, + -0.3425217270851135, + -0.013077914714813232, + -0.038474079221487045, + -5.3364362716674805, + -0.42365288734436035, + -1.884093999862671, + -3.510357618331909, + -6.198029518127441, + -0.44375038146972656, + -0.0008789013954810798, + -3.6025230884552, + -1.419615626335144, + -2.6723289489746094, + -5.775190830230713, + -1.1380761861801147, + -2.6683366298675537, + -0.43395891785621643, + -0.003145867260172963, + -8.63144302368164, + -1.646262764930725, + -1.732487678527832, + -4.561546802520752, 
+ -0.5277953147888184, + -0.07333153486251831, + -0.5624169707298279, + -0.12201295047998428, + -2.6561455726623535, + -1.1071691513061523, + -2.6895060539245605, + -0.040864069014787674, + -0.04126371443271637, + -1.8294739723205566, + -0.09022177755832672, + -0.3154001832008362, + -0.46215569972991943, + -2.2462844848632812, + -0.30149081349372864, + -0.52588951587677, + -8.288043975830078, + -0.0002057340752799064, + -0.8021711707115173, + -4.4546098709106445, + -0.0001565095444675535, + -0.0015961299650371075, + -0.15216240286827087, + -0.3677564561367035, + -5.018707275390625, + -0.7850045561790466, + -1.9582659006118774, + -1.0046892166137695, + -10.0401029586792, + -0.16878114640712738, + -5.944240570068359, + -1.5523078441619873, + -5.7253522872924805, + -0.47948503494262695, + -0.44009655714035034, + -5.671053886413574, + -0.003280022880062461, + -0.7937742471694946, + -0.9639376401901245, + -0.00030048147891648114, + -1.0747740268707275, + -0.8839919567108154, + -3.416811466217041, + -1.6602673530578613, + -0.2706959843635559, + -0.0024333172477781773, + -4.478696823120117, + -6.20179557800293, + -0.11359559744596481, + -0.202009916305542, + -0.022310219705104828, + -2.367263078689575, + -1.0405994653701782, + -5.984308242797852, + -2.105138063430786, + -9.583202362060547, + -0.0004957877099514008, + -3.0655455589294434, + -0.0669412910938263, + -0.8977450728416443, + -2.2271294593811035, + -2.6617536544799805, + -1.8184051513671875, + -0.8291114568710327, + -0.4864235818386078, + -0.7993525862693787, + -3.51106858253479, + -2.1530935764312744, + -0.257144957780838, + -1.3934082984924316, + -1.3137131929397583, + -0.3384077548980713, + -0.1697217971086502, + -2.353395938873291, + -0.03406282886862755, + -0.39059701561927795, + -3.422821044921875, + -1.7117210626602173, + -0.7018465399742126, + -1.5995906591415405, + -3.6218395233154297, + -0.12497704476118088, + -0.16966234147548676, + -0.7313685417175293, + -0.4956285357475281, + -1.0840849876403809, + -5.042126655578613, + -0.00031704644788987935, + -7.683258056640625, + -0.9210801720619202, + -4.687852382659912, + -0.0028814247343689203, + -0.043382611125707626, + -4.1948652267456055, + -2.66593337059021, + -0.06153333932161331, + -0.0023110604379326105, + -6.729236602783203, + -5.777127742767334, + -0.08932067453861237, + -0.09890018403530121, + -0.009886111132800579, + -3.1145148277282715, + -3.725565195083618, + -0.0021998509764671326, + -3.9927196502685547, + -2.753793239593506, + -1.6037236452102661, + -0.17461130023002625, + -4.804804801940918, + -0.2311229705810547, + -0.30256444215774536, + -2.235363006591797, + -0.006614102050662041, + -0.34757524728775024, + -1.4946835041046143, + -1.222062587738037, + -3.658839225769043, + -1.356170892715454, + -0.5371109843254089, + -3.7580835819244385, + -4.54621696472168, + -0.31577637791633606, + -3.677156925201416, + -2.7181396484375, + -7.4674882888793945, + -0.00019369633810129017, + -2.3798398971557617, + -2.5452184677124023, + -0.2858496308326721, + -4.315659523010254, + -0.025835415348410606, + -0.000603493710514158, + -0.2546294331550598, + -0.12032663822174072, + -2.006908655166626, + -5.990736961364746, + -7.146596908569336, + -0.23356498777866364, + -0.2201036810874939, + -0.01235415879637003, + -0.011248741298913956, + -1.4155778884887695, + -0.40242519974708557, + -5.877886772155762, + -0.7865053415298462, + -0.03231288120150566, + -0.004864405374974012, + -0.0050629740580916405, + -2.7049152851104736, + -6.822089195251465, + -0.39252761006355286, + 
-1.2290617227554321, + -0.007630132604390383, + -3.485461711883545, + -0.47985684871673584, + -6.1813530921936035, + -0.03757825121283531, + -0.37834712862968445, + -0.22192610800266266, + -1.165318489074707, + -0.5220151543617249, + -0.1289423257112503, + -3.216222047805786, + -1.0787583589553833, + -3.0716826915740967, + -0.6023419499397278, + -2.558605194091797, + -0.927433431148529, + -0.00364841241389513, + -0.14910078048706055, + -0.7318926453590393, + -6.159773826599121, + -0.0015301911626011133, + -1.8908276557922363, + -1.9641315937042236, + -0.021651331335306168, + -2.1648828983306885, + -2.2700207233428955, + -7.833290100097656, + -0.03397307172417641, + -0.8344621658325195, + -0.02225659228861332, + -0.06639260798692703, + -2.3780317306518555, + -3.180129051208496, + -0.09030630439519882, + -2.4138312339782715, + -1.3445552587509155, + -1.848326325416565, + -0.9726964831352234, + -2.851792335510254, + -0.0630769282579422, + -0.0011394681641831994, + -0.05843213573098183, + -2.6616668701171875, + -1.575437068939209, + -0.180197611451149, + -5.552371501922607, + -0.26108410954475403, + -2.529611587524414, + -0.37780019640922546, + -5.141795635223389, + -0.5921107530593872, + -0.2474975287914276, + -0.10687454044818878, + -4.891775131225586, + -0.25011152029037476, + -2.4100728034973145, + -1.358667016029358, + -2.790961503982544, + -3.8654675483703613, + -1.0076243877410889, + -0.7456949949264526, + -1.5575554370880127, + -2.05328631401062, + -1.6538066864013672, + -0.0558217354118824, + -0.0001817776501411572, + -0.0011643542675301433, + -0.038359593600034714, + -1.4208931922912598, + -0.542127251625061, + -0.3162364959716797, + -0.3966117799282074, + -1.1765563488006592, + -1.7920958995819092, + -0.18425509333610535, + -0.1092008650302887, + -0.46676987409591675, + -0.24977745115756989, + -1.0375996828079224, + -0.5268858671188354, + -0.008942908607423306, + -0.6404479146003723, + -0.0033111530356109142, + -5.3165931603871286e-05, + -0.5154370665550232, + -0.39286962151527405, + -1.401839256286621, + -0.6232213973999023, + -0.02168831042945385, + -0.004282470792531967, + -0.005199837032705545, + -0.09748794883489609, + -0.040823787450790405, + -0.00014852374442853034, + -0.0005832401220686734, + -0.005303124897181988, + -0.6537013053894043, + -0.38026049733161926, + -0.04189129173755646, + -0.010385753586888313, + -0.008756335824728012, + -0.013362848199903965, + -0.000504723924677819, + -0.002797620603814721, + -0.0014512732159346342, + -0.0013321106089279056, + -0.010883613489568233, + -0.005159396678209305, + -0.004701037425547838, + -0.01591104455292225, + -0.001474246964789927, + -1.2278481335670222e-05, + -0.010548785328865051, + -0.08341525495052338, + -0.03858809545636177, + -0.056062061339616776, + -0.0009532198309898376, + -0.0005789510905742645, + -0.0008986725588329136, + -0.00710969977080822, + -0.0006561510381288826, + -1.4781842764932662e-05, + -5.578839045483619e-05, + -0.0006398299592547119, + -0.0028786908369511366, + -0.0034092895220965147, + -0.008268529549241066, + -0.006602259818464518, + -0.004517706111073494, + -0.02233586646616459, + -0.0006323245470412076, + -0.009195122867822647, + -0.0029284947086125612, + -0.004457537550479174, + -0.017873765900731087, + -0.008801711723208427, + -0.0036383166443556547, + -0.08078611642122269, + -0.006347495596855879, + -0.0002177716523874551, + -0.04688572511076927, + -0.2718890309333801, + -0.07996802777051926, + -0.04674842208623886, + -0.009984076954424381, + -0.010000954382121563, + -0.050126753747463226, + 
-0.5864179730415344, + -0.2915390133857727, + -0.008090462535619736, + -0.032190412282943726, + -0.03461571782827377, + -0.2785419523715973, + -0.05830562859773636, + -0.02893950417637825, + -0.12241066247224808, + -0.02711048536002636, + -0.16450686752796173, + -0.09852994978427887, + -0.2651047706604004, + -0.35559725761413574, + -0.12606258690357208, + -0.32793670892715454, + -0.20878805220127106, + -0.7587923407554626, + -0.4730657637119293, + -1.496794581413269, + -0.2486363798379898, + -0.20723387598991394, + -0.1872958242893219, + -0.19151091575622559, + -0.3350580036640167, + -1.3085839748382568, + -0.6109102964401245, + -0.2947172224521637, + -0.37130236625671387, + -0.19888469576835632, + -0.18297068774700165, + -0.9978674054145813, + -0.5471905469894409, + -0.4379975199699402, + -0.407988041639328, + -0.17325688898563385, + -0.16938896477222443 + ] + }, + "96": { + "input_prompt": "what is the concept of double materiality in sustainability?", + "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", + "generated_tokens": [ + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318 + ], + "latency": 123.03724575042725, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.917365074157715, + -0.9960631132125854, + -7.875392913818359, + -0.2993181347846985, + -7.760880470275879, + -10.308395385742188, + -2.1807961463928223, + -1.6412583589553833, + -9.521512985229492, + -1.627489447593689, + -1.8410861492156982, + -0.9285702705383301, + -0.2576955556869507, + -0.9641067981719971, + -0.02314644306898117, + -0.6696561574935913, + -0.07035009562969208, + -0.004622488282620907, + -0.025748632848262787, + -0.06276137381792068, + -0.17385317385196686, + -0.3285445272922516, + -0.0592009499669075, + -0.007940039038658142, + -0.22664028406143188, + -0.0017957051750272512, + -0.022929180413484573, + -0.005733947269618511, + 
-0.0012996093137189746, + -0.006419987417757511, + -0.02376849390566349, + -0.27800270915031433, + -0.4650723934173584, + -0.04936715215444565, + -0.003972141072154045, + -0.01477995328605175, + -0.0012044801842421293, + -0.014891182072460651, + -0.002709767082706094, + -0.0009939497103914618, + -0.0028436246793717146, + -0.006759870797395706, + -0.15416178107261658, + -0.20121537148952484, + -0.016414370387792587, + -0.0015769677702337503, + -0.008138825185596943, + -0.0007713441736996174, + -0.013819841668009758, + -0.003826678032055497, + -0.0005918181850574911, + -0.0014938872773200274, + -0.00485716899856925, + -0.081083282828331, + -0.09642580896615982, + -0.009630884043872356, + -0.0010948146227747202, + -0.007085552904754877, + -0.0006310140597634017, + -0.013073914684355259, + -0.0039152647368609905, + -0.000364713923772797, + -0.001292108790948987, + -0.004158303141593933, + -0.044283974915742874, + -0.05722038820385933, + -0.006369172595441341, + -0.0007976687629707158, + -0.005993015132844448, + -0.0004935238393954933, + -0.011310506612062454, + -0.002951553324237466, + -0.000387831823900342, + -0.000977038755081594, + -0.0036971091758459806, + -0.030511993914842606, + -0.04246694967150688, + -0.004863100592046976, + -0.0006927236099727452, + -0.005206122528761625, + -0.0005129451747052372, + -0.00894621666520834, + -0.0028565814718604088, + -0.00041333239641971886, + -0.0009002208826132119, + -0.0033131728414446115, + -0.021188799291849136, + -0.03330245241522789, + -0.0038543473929166794, + -0.0006504327175207436, + -0.004474864806979895, + -0.00048029806930571795, + -0.009718249551951885, + -0.0030443770810961723, + -0.0003743662964552641, + -0.0009439303539693356, + -0.003729770192876458, + -0.016505014151334763, + -0.0290373582392931, + -0.003315192647278309, + -0.0005821678787469864, + -0.004148805979639292, + -0.00042489083716645837, + -0.006856840569525957, + -0.0028660909738391638, + -0.00032574593205936253, + -0.0006986799417063594, + -0.003671098267659545, + -0.012792548164725304, + -0.02553274855017662, + -0.002730690874159336, + -0.0005067494930699468, + -0.0036923582665622234, + -0.0004451475979294628, + -0.007225453853607178, + -0.002746859099715948, + -0.0003323002893012017, + -0.0008517451351508498, + -0.003630714723840356, + -0.011284693144261837, + -0.02186688780784607, + -0.0025819556321948767, + -0.0004931663861498237, + -0.0031721293926239014, + -0.00040725519647821784, + -0.0062383925542235374, + -0.00238693761639297, + -0.00040749352774582803, + -0.0007970731821842492, + -0.003604583442211151, + -0.010075542144477367, + -0.022386692464351654, + -0.0025295186787843704, + -0.00046302087139338255, + -0.003138143103569746, + -0.0004233417857903987, + -0.006178564392030239, + -0.0021037368569523096 + ] + }, + "127": { + "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", + "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly 
but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", + "generated_tokens": [ + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505 + ], + "latency": 151.56505846977234, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.793755531311035, + -4.224940776824951, + -5.446587562561035, + -0.267395555973053, + -6.118141174316406, + -11.105525970458984, + -6.67517614364624, + -6.902530670166016, + -3.2218151092529297, + -5.99644660949707, + -8.431673049926758, + -8.04836368560791, + -3.992594003677368, + -7.550079345703125, + -2.873685836791992, + -3.3877997398376465, + -7.9306745529174805, + -7.005484580993652, + -1.0481306314468384, + -2.686237335205078, + -6.131283760070801, + -6.2994704246521, + -7.931419372558594, + -11.3147554397583, + -8.544670104980469, + -12.01729679107666, + -3.89847469329834, + -1.7964364290237427, + -2.952878952026367, + -1.9217232465744019, + -2.272329330444336, + -0.37552154064178467, + -1.667820692062378, + -7.510344505310059, + -3.498040199279785, + -7.980632305145264, + -7.672002792358398, + -4.4999470710754395, + -7.155375003814697, + -2.4486124515533447, + -4.785946846008301, + -1.153855800628662, + -2.3994438648223877, + -4.250652313232422, + -12.24446964263916, + -8.344388008117676, + -2.608186721801758, + -5.200589179992676, + -8.25888442993164, + -3.6245617866516113, + -7.689338207244873, + -7.345355033874512, + -1.2661759853363037, + -7.265620231628418, + -1.9884108304977417, + -6.269482612609863, + -2.41705584526062, + -1.8929681777954102, + -1.8259913921356201, + -2.0997350215911865, + -2.323200225830078, + -1.3998825550079346, + -0.8789899945259094, + -1.082053542137146, + -1.1831339597702026, + -1.4462857246398926, + -1.6481035947799683, + -1.4408715963363647, + -1.2603964805603027, + -1.5267670154571533, + -1.6345772743225098, + -1.3796477317810059, + -0.7609691023826599, + -0.3548354506492615, + -0.7552334666252136, + -0.44776833057403564, + -1.1078286170959473, + -1.3036658763885498, + -0.5214896202087402, + -0.8486822843551636, + -0.22470997273921967, + -0.4705755412578583, + -0.5639711022377014, + -0.5388108491897583, + -0.6052999496459961, + -0.1002030223608017, + -0.286334365606308, + -0.45798981189727783, + -1.0107953548431396, + -0.11875647306442261, + -0.6969441771507263, + -0.4609107971191406, + -0.07614769786596298, + -0.5035472512245178, + -0.1682187020778656, + -0.10476160794496536, + -0.6586751341819763, + -0.35806939005851746, + -1.5364394187927246, + 
-2.4093759059906006, + -1.977368950843811, + -1.6216907501220703, + -0.27647316455841064, + -0.2991848587989807, + -0.2783535420894623, + -0.05913994088768959, + -0.03023873083293438, + -0.043339803814888, + -0.7320341467857361, + -0.0030677898321300745, + -0.0332595594227314, + -0.012804670259356499, + -0.004041599575430155, + -0.0014899593079462647, + -0.001948602613992989, + -0.0029070996679365635, + -0.040939707309007645, + -0.013942227698862553, + -0.04897322878241539, + -0.011005887761712074, + -0.0044113704934716225, + -0.0013179434463381767, + -0.003658389439806342, + -0.009758152067661285, + -0.0014104428701102734, + -0.0016671819612383842, + -0.000771939754486084, + -0.0015519729349762201, + -0.003720743814483285, + -0.004249115474522114, + -0.00485657574608922, + -0.005053604021668434, + -0.002336274366825819, + -0.0009155849111266434, + -0.0004978132783435285, + -0.0005953923100605607, + -0.0011395872570574284, + -0.001485078944824636, + -0.3072909712791443, + -1.7295066118240356, + -0.4807289242744446, + -0.1245415136218071, + -0.011858444660902023, + -0.020613837987184525, + -0.011020978912711143, + -0.003106294432654977, + -0.0009966888464987278, + -0.0019349202048033476, + -0.037407051771879196, + -0.0003496989083942026, + -0.005922981072217226, + -0.007394562941044569, + -0.0006037319544702768, + -0.0008836655179038644, + -0.0002884448622353375, + -0.00047600860125385225, + -0.0024947968777269125, + -0.00442774873226881, + -0.004059052560478449, + -0.0018594847060739994, + -0.0006179092451930046, + -0.00022635281493421644, + -0.0006730675231665373, + -0.003022746881470084, + -0.0002343380037928, + -0.00047791501856409013, + -9.440929716220126e-05, + -0.00021550717065110803, + -0.0013523490633815527, + -0.0032202552538365126, + -0.001157686347141862, + -0.004449942149221897, + -0.0016590891173109412, + -0.00101062236353755, + -0.0003079893649555743, + -0.00048375347978435457, + -0.0021734442561864853, + -0.00423036003485322, + -0.11514264345169067, + -0.8658493757247925, + -0.084366075694561, + -0.02140468917787075, + -0.0060798698104918, + -0.008638513274490833, + -0.003212531330063939, + -0.0009598892065696418, + -0.00032085992279462516 + ] + }, + "throughput": [107.66332959870442, 107.66332959870442] +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/model_config.yaml new file mode 100644 index 00000000000..9229a0869ae --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -0,0 +1,66 @@ +# Mirrors legacy test_case: gpt_dynamic_inference_tp1_pp1_583m_logitsmatch +# (driven by examples/inference/legacy/gpt_dynamic_inference.py). +# This case drives examples/inference/offline_inference.py in async direct mode +# (--async-mode, no --use-coordinator). Goldens copied from the legacy case; +# async path must produce numerically identical output to the sync path. 
+ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility + --inference-dynamic-batching-buffer-guaranteed-fraction: 0.05 + --inference-dynamic-batching-buffer-overflow-factor: 1.2 + --inference-dynamic-batching-buffer-size-gb: 20 + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${INFERENCE_OUTPUT_PATH} + --output-every-n-results: 32 + --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl + --prompt-file-num-truncate: 128 # originally 1024 + --num-tokens-to-generate: 128 # originally 512 + --incoming-requests-per-step: 32 + --termination-id: -1 + --inference-repeat-n: 3 + --inference-logging-step-interval: 1 + --async-mode: true +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..92eb5c6cab0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json @@ -0,0 +1,158 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. 
And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 1.964757651090622, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml new file mode 100644 index 00000000000..c0c831c7e1b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml @@ -0,0 +1,62 @@ +# Mirrors legacy test_case: 
gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq +# (driven by examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py). +# This case drives examples/inference/offline_inference.py in async coordinator mode +# (--use-coordinator and --async-mode). Goldens copied from the legacy case; +# async path must produce numerically identical output to the sync path. +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: inference_optimized + --sequence-parallel: true + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --num-tokens-to-generate: 30 + --inference-dynamic-batching-buffer-size-gb: 20 + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${INFERENCE_OUTPUT_PATH} + --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." 
+ --incoming-requests-per-step: 32 + --use-flashinfer-fused-rope: true + --inference-logging-step-interval: 1 + --use-coordinator: true + --async-mode: true +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..dd8b08e446f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/golden_values_dev_dgx_h100.json @@ -0,0 +1,2699 @@ +{ + "0": { + "input_prompt": "The $500 Cup of coffee?\nConsider this, most Americans spend an average of $1,500-2,000 a year on this bean water.\nI have a few question for you: \nHow has business been the past few months?\nDo you ever feel like your business is stuck?\nDon't feel like you're able to improve performance and make changes required to achieve success ?\nAre your customers spneding less and less and wanting more?\nHave the gas prices affected your business?\nDo you have employees and do they hate you or wish they could quit?\n\nNow, before you and I can decide wheter or not I will be a good fit for your business we should talk this over with coffee.\nAnd, just to warn you this isn't some casual thing. This is not a date or time to be personal or social (but by all means share what you will coz I'll gladly listen).\nTher eare two major talking points and stratagies we will focios on in our lil coffee social\nFor one, we will find your unique selling Proposition (USP).\nDo have the best price? Are you the cheapest in town? Are your customers jerks? Do you haVE REGULARS? Why do people come back?\nwe'll also look for the holes in your business bucket. I'm willing to bet there's a hole or two in your business we'll find together that'll make this 500 dollar cup of Joe pay for itse;f immedietly.\nMany find themselves to be more profitable by just finding out where the dollars are escaping in their business and I like to think of myself as a guy that comes along with some spakel or putty and patch those holes up for you.\nBeleive me, just fixing one hole can mean a lot...just think about a sinking boat that has a hole in it that's about 3\u201d in diameter... it doesn't take long to sink.\nI have no agenda, besides f=getting to know your business and seeing wher I can patch the holes and find what makes you do darn unique (I know this won't take long.)\nMany folks, I bet, will find what they need to get off their chest with a quick phone call and they just paypal me the money and make a coffee at home. Look, that's fine too.\nI just to get you ot of your comfort zone, because this is where it all starts my frind.\nSome smart GOAT entrepreneur will probably get everything they need just out of our lil mini consulatant for the more extensive business consukting I offer, and look, that's fine, too.\nMaybe this cup of coffee will be all you need to gtet back on your feet, but not only surive, but thrive!\nI'm not trying to make frineds, or make a bunch of money, or look down your shirt\nBut this is only going to be a 45 minute (max) coffee chat\nAnd, I know you (and me) both have a lot on our plates. 
So no messing around\nAfter our consultation I will follow up with you in a few days and see how things are going, then I will be emailing you about once or twice every two weeks, just to make sure you're staying on task and implementing what we went over.\nTghere is no obligation to go any further and will gladly give you back your money if this pre-consultation doesn't put you on the right path or you don't get any value out of it...", + "generated_text": " $ is a$ is a $ is a $ is a $ is a $ is a $$1, you\n$ $$ $\n$ $- $\n$\n$\n$ $$$\n$\n$\n$\n$\n$\n$\n$\n$???????, $\n-1\n$\n1.5.\n$\n$, you\n$.\n$\n1,1.1\nI\n$.\nI\n\n\nHow\n$,,,0,1,0,0.0\nIn\nThe\nThe\nThe\n", + "generated_tokens": [ + 1659, + 1395, + 1261, + 1036, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1395, + 1261, + 1659, + 1036, + 1049, + 1044, + 1636, + 1010, + 1036, + 1659, + 1036, + 1659, + 1010, + 1036, + 1659, + 1045, + 1659, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1659, + 1036, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1010, + 1036, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1063, + 1044, + 1659, + 1010, + 1045, + 1049, + 1010, + 1036, + 1010, + 1049, + 1046, + 1053, + 1046, + 1010, + 1036, + 1010, + 1036, + 1044, + 1636, + 1010, + 1036, + 1046, + 1010, + 1036, + 1010, + 1049, + 1044, + 1049, + 1046, + 1049, + 1010, + 1073, + 1010, + 1036, + 1046, + 1010, + 1073, + 1010, + 1010, + 1010, + 7801, + 1010, + 1036, + 1044, + 1044, + 1044, + 1048, + 1044, + 1049, + 1044, + 1048, + 1044, + 1048, + 1046, + 1048, + 1010, + 1785, + 1010, + 1784, + 1010, + 1784, + 1010, + 1784, + 1010 + ], + "latency": 9.61214280128479, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.7319135665893555, + -2.188307285308838, + -0.7547445297241211, + -0.7294313311576843, + -10.238386154174805, + -3.3775341510772705, + -6.394498825073242, + -7.354557037353516, + -9.018157958984375, + -3.012073040008545, + -3.2584073543548584, + -5.220732688903809, + -4.620487213134766, + -2.5078930854797363, + -3.752683162689209, + -0.13360372185707092, + -0.05705544352531433, + -0.41462242603302, + -1.585279941558838, + -1.6438164710998535, + -1.9557222127914429, + -0.3989897072315216, + -0.0365302674472332, + -6.368816375732422, + -0.8731719255447388, + -0.022585075348615646, + -0.2775891423225403, + -0.0027362785767763853, + -0.0006812873762100935, + -1.581446647644043, + -0.008688976056873798, + -0.3532317280769348, + -6.071163177490234, + -9.162371635437012, + -9.965556144714355, + -2.400461196899414, + -2.9898362159729004, + -2.9803032875061035, + -2.12601900100708, + -3.500912666320801, + -7.015069007873535, + -2.278961420059204, + -0.46380555629730225, + -4.078739166259766, + -1.9430254697799683, + -3.5642244815826416, + -3.689701795578003, + -6.201474189758301, + -6.580833911895752, + -2.3081111907958984, + -5.42717170715332, + -1.1886008977890015, + -1.172760248184204, + -1.3571951389312744, + -1.3551844358444214, + -3.376784324645996, + -0.05118789151310921, + -4.064360618591309, + -2.575554847717285, + -0.6994737386703491, + -2.56724214553833, + -2.1888976097106934, + -0.4816131591796875, + -4.070178985595703, + -2.0060782432556152, + -6.858033180236816, + -0.059200502932071686, + -3.214278221130371, + -0.9671833515167236, + -0.823198676109314, + -1.0130078792572021, + -4.595561981201172, + -0.012724989093840122, + -5.214311599731445, + -8.246870040893555, + 
-3.1476030349731445, + -3.299684524536133, + -4.218191146850586, + -7.318399429321289, + -0.8580498695373535, + -3.0894036293029785, + -1.886361002922058, + -7.217658996582031, + -3.271679639816284, + -3.9717154502868652, + -1.8835484981536865, + -10.034332275390625, + -11.382490158081055, + -5.417011260986328, + -7.505967140197754, + -2.33837890625, + -0.07904055714607239, + -3.294971227645874, + -7.813640594482422, + -1.7646901607513428, + -4.025320053100586, + -3.5977325439453125, + -4.390352249145508, + -9.147806167602539, + -0.5303041934967041, + -7.721246242523193, + -0.6311959028244019, + -0.8119025230407715, + -0.7227814197540283, + -1.8369406461715698, + -0.20933297276496887, + -1.5395950078964233, + -4.424448490142822, + -4.084965705871582, + -3.355497360229492, + -1.0475609302520752, + -6.479413986206055, + -0.7810530662536621, + -2.132437229156494, + -6.648703098297119, + -2.9522438049316406, + -1.2485712766647339, + -4.040503025054932, + -2.3415768146514893, + -5.358206748962402, + -1.6258506774902344, + -3.956300973892212, + -0.732298731803894, + -7.441117286682129, + -1.5242161750793457, + -2.4555861949920654, + -4.295163154602051, + -9.687600135803223, + -0.8213484883308411, + -1.2446978092193604, + -0.01942702941596508, + -4.619411468505859, + -3.3297007083892822, + -2.2139487266540527, + -3.691431999206543, + -2.6574106216430664, + -6.075929641723633, + -0.6123450994491577, + -1.2942559719085693, + -0.6262839436531067, + -7.398006439208984, + -4.4869890213012695, + -4.202048301696777, + -4.982994079589844, + -0.637227475643158, + -3.061023235321045, + -10.117584228515625, + -3.8567495346069336, + -4.0480828285217285, + -2.472019672393799, + -4.246374607086182, + -1.3939155340194702, + -7.132441520690918, + -0.20108745992183685, + -4.986658573150635, + -4.387957572937012, + -0.01108358334749937, + -4.209756851196289, + -7.271108627319336, + -4.047314643859863, + -2.6497321128845215, + -1.4763175249099731, + -0.28365400433540344, + -3.5247769355773926, + -1.4226995706558228, + -4.327237129211426, + -2.0407187938690186, + -6.1437907218933105, + -1.5190880298614502, + -2.5511486530303955, + -7.504094123840332, + -2.152172565460205, + -6.708334922790527, + -6.913146495819092, + -3.6959621906280518, + -6.752341270446777, + -0.63083815574646, + -0.12433214485645294, + -5.0525641441345215, + -4.435934066772461, + -0.45601028203964233, + -6.3459577560424805, + -9.882917404174805, + -3.1422882080078125, + -2.550520658493042, + -3.2099051475524902, + -6.278127193450928, + -0.07764133810997009, + -3.155696153640747, + -1.933587670326233, + -9.61027717590332, + -6.211391925811768, + -4.664543151855469, + -6.783782005310059, + -5.676271438598633, + -8.605900764465332, + -0.0824289619922638, + -3.5463995933532715, + -13.374168395996094, + -1.2401021718978882, + -1.8734056949615479, + -3.4154422283172607, + -1.6733763217926025, + -17.633970260620117, + -9.345113754272461, + -0.6277351975440979, + -2.9617538452148438, + -2.5565333366394043, + -10.10580825805664, + -7.130337715148926, + -7.36820125579834, + -4.098911285400391, + -5.747079372406006, + -2.945054769515991, + -0.7887389063835144, + -1.6583149433135986, + -1.0165244340896606, + -6.581666946411133, + -5.926386833190918, + -5.845194339752197, + -0.9657630920410156, + -7.868755340576172, + -1.3244551420211792, + -0.2657390236854553, + -0.06403665244579315, + -2.983020782470703, + -5.943899631500244, + -7.877285957336426, + -3.593116283416748, + -3.819509506225586, + -7.226177215576172, + -2.5206997394561768, + -3.385587215423584, 
+ -0.37499159574508667, + -1.4698283672332764, + -3.1460342407226562, + -0.0077166082337498665, + -4.350916862487793, + -3.2183218002319336, + -0.6242184638977051, + -1.4782464504241943, + -2.8054311275482178, + -3.0831401348114014, + -12.17662525177002, + -2.113419532775879, + -1.6448111534118652, + -2.1834323406219482, + -0.7630388140678406, + -10.1896390914917, + -6.234405517578125, + -11.46288776397705, + -1.003785490989685, + -4.211658477783203, + -1.5010679960250854, + -5.859302043914795, + -2.0465080738067627, + -3.7468819618225098, + -4.684195518493652, + -4.318704128265381, + -2.7234389781951904, + -9.00437068939209, + -3.043811321258545, + -3.1384406089782715, + -2.713779926300049, + -2.095993995666504, + -2.1484954357147217, + -10.274479866027832, + -0.682350754737854, + -0.25973302125930786, + -3.6964316368103027, + -13.434456825256348, + -2.3368239402770996, + -5.382724761962891, + -1.9073458909988403, + -5.905669212341309, + -0.032165709882974625, + -1.6530004739761353, + -2.728893280029297, + -1.640552043914795, + -1.1391171216964722, + -1.4353511333465576, + -4.003787994384766, + -0.3450564742088318, + -0.7168521285057068, + -0.34650325775146484, + -0.3616408705711365, + -7.062709331512451, + -1.2851682901382446, + -2.299129009246826, + -8.800156593322754, + -5.208735466003418, + -4.780910491943359, + -2.78342342376709, + -4.469717979431152, + -6.909726619720459, + -2.5114197731018066, + -0.659822404384613, + -0.6915416121482849, + -3.2363741397857666, + -0.5283617377281189, + -0.10473938286304474, + -6.215325832366943, + -7.283237934112549, + -1.6797031164169312, + -11.50100040435791, + -7.5822978019714355, + -3.387317657470703, + -11.407575607299805, + -5.441976547241211, + -3.3264851570129395, + -0.7265786528587341, + -1.382750153541565, + -7.841699600219727, + -8.105277061462402, + -3.9569506645202637, + -4.963083267211914, + -0.5492897629737854, + -4.6081390380859375, + -5.870400905609131, + -3.957930088043213, + -5.275494575500488, + -4.105091094970703, + -2.15435528755188, + -2.8472700119018555, + -1.1278448104858398, + -8.226571083068848, + -0.40629008412361145, + -9.916461944580078, + -4.616743087768555, + -1.691868543624878, + -0.6639478802680969, + -2.5716753005981445, + -6.676954746246338, + -6.535329818725586, + -0.4170510768890381, + -1.443942904472351, + -3.145481824874878, + -1.440589427947998, + -0.26935356855392456, + -0.9647155404090881, + -4.335958957672119, + -1.5647850036621094, + -5.890466690063477, + -3.01654052734375, + -1.9168468713760376, + -3.7365682125091553, + -8.001864433288574, + -10.680083274841309, + -4.489352226257324, + -4.6058149337768555, + -7.69011116027832, + -3.6247005462646484, + -1.5600426197052002, + -10.2160062789917, + -5.004643440246582, + -0.19602319598197937, + -3.375545024871826, + -2.669325590133667, + -1.3932737112045288, + -1.6410658359527588, + -6.847603797912598, + -6.744344711303711, + -0.5215591192245483, + -0.25840020179748535, + -1.1448237895965576, + -5.57253885269165, + -7.251138687133789, + -4.221924781799316, + -0.7688062787055969, + -2.504502534866333, + -3.146519660949707, + -2.206653356552124, + -1.4295082092285156, + -7.96943998336792, + -4.332189083099365, + -2.5750505924224854, + -1.7102608680725098, + -5.311381816864014, + -8.897522926330566, + -2.994919538497925, + -3.3397974967956543, + -2.1794328689575195, + -2.437566041946411, + -0.3181810975074768, + -0.27412793040275574, + -0.7914466857910156, + -2.3470635414123535, + -2.4099245071411133, + -2.491870880126953, + -3.024170160293579, + 
-1.9719040393829346, + -11.373910903930664, + -1.4279751777648926, + -0.14573107659816742, + -2.055763006210327, + -6.366893291473389, + -4.24091911315918, + -0.00709194503724575, + -2.0199716091156006, + -2.524750232696533, + -1.4272525310516357, + -0.5185190439224243, + -2.927150011062622, + -2.7070627212524414, + -3.365638017654419, + -4.318085193634033, + -7.773144721984863, + -1.7947180271148682, + -7.657534599304199, + -8.767786026000977, + -14.74280071258545, + -1.8042558431625366, + -3.2712037563323975, + -1.4002125263214111, + -4.887944221496582, + -1.4821010828018188, + -1.5255622863769531, + -5.879070281982422, + -4.463839530944824, + -5.1955976486206055, + -5.665647506713867, + -0.3775045573711395, + -5.9350481033325195, + -2.800539255142212, + -0.13162286579608917, + -3.034379720687866, + -4.729524612426758, + -4.6252641677856445, + -3.850942611694336, + -2.4760568141937256, + -6.059760093688965, + -10.12075138092041, + -0.9469369649887085, + -11.595907211303711, + -6.875324726104736, + -4.268826007843018, + -2.835529088973999, + -3.8626279830932617, + -4.876199245452881, + -0.013071090914309025, + -4.964417934417725, + -0.7445687055587769, + -5.707155227661133, + -6.10660457611084, + -4.317755699157715, + -4.440443992614746, + -2.9202542304992676, + -4.743522644042969, + -1.2569392919540405, + -2.8675737380981445, + -2.3151841163635254, + -4.318130970001221, + -1.9054772853851318, + -1.1808521747589111, + -0.765956461429596, + -2.768916606903076, + -6.237791061401367, + -1.7224305868148804, + -7.137521743774414, + -4.512486457824707, + -1.9069950580596924, + -4.145983695983887, + -5.365190505981445, + -0.059828490018844604, + -2.273892879486084, + -3.4013004302978516, + -5.035730361938477, + -6.501443386077881, + -9.903446197509766, + -1.6332892179489136, + -2.1572084426879883, + -1.6149548292160034, + -1.4698481559753418, + -6.01010799407959, + -2.2243528366088867, + -6.900836944580078, + -6.0930986404418945, + -2.974020481109619, + -3.225423574447632, + -8.423272132873535, + -1.3423724174499512, + -3.626147508621216, + -0.4862469434738159, + -6.860866546630859, + -3.8910953998565674, + -2.33319354057312, + -1.7229185104370117, + -2.215972423553467, + -8.99046516418457, + -4.099084854125977, + -2.4191012382507324, + -8.288970947265625, + -2.9641928672790527, + -1.5036451816558838, + -3.0544614791870117, + -0.0715634673833847, + -2.444031238555908, + -4.520998954772949, + -3.972568988800049, + -0.4985870122909546, + -2.1651363372802734, + -3.4427435398101807, + -1.730639100074768, + -0.9458961486816406, + -7.740211009979248, + -9.39163875579834, + -3.895984172821045, + -1.7523534297943115, + -5.41331672668457, + -8.910720825195312, + -12.971094131469727, + -3.0455880165100098, + -10.501265525817871, + -3.3864927291870117, + -4.842309951782227, + -3.9964733123779297, + -7.3046793937683105, + -2.6607093811035156, + -1.3541781902313232, + -5.003270626068115, + -3.944551944732666, + -0.11356143653392792, + -5.174440383911133, + -9.628616333007812, + -8.654989242553711, + -8.980416297912598, + -6.670101642608643, + -5.488286018371582, + -5.943419933319092, + -2.126483201980591, + -8.054739952087402, + -7.458671569824219, + -2.5267202854156494, + -6.455472946166992, + -8.655346870422363, + -7.903901100158691, + -6.221062660217285, + -7.129237174987793, + -4.2345380783081055, + -2.5375306606292725, + -7.697700500488281, + -1.567080020904541, + -2.084331750869751, + -0.25020831823349, + -1.5145041942596436, + -4.619244575500488, + -0.2970108985900879, + -0.4977554678916931, + 
-6.197869300842285, + -4.030620098114014, + -7.232107639312744, + -0.21076253056526184, + -1.563366174697876, + -1.133756160736084, + -2.708237648010254, + -4.080535888671875, + -0.6818401217460632, + -0.1864331066608429, + -0.49012088775634766, + -8.732468605041504, + -11.945040702819824, + -5.243098735809326, + -1.5294703245162964, + -0.8935543298721313, + -0.6174070835113525, + -1.5068217515945435, + -3.5766501426696777, + -5.393096923828125, + -4.202867031097412, + -14.765748023986816, + -5.2513813972473145, + -0.7597705721855164, + -0.2502063810825348, + -1.7403976917266846, + -2.8000779151916504, + -1.9808133840560913, + -2.1654744148254395, + -1.8629226684570312, + -3.222038745880127, + -0.040942225605249405, + -2.3384013175964355, + -10.210381507873535, + -4.5859761238098145, + -0.5805734395980835, + -3.7019288539886475, + -2.001936674118042, + -2.7876083850860596, + -2.9799084663391113, + -4.349887371063232, + -0.0792960673570633, + -1.4366114139556885, + -1.0813264846801758, + -1.3510822057724, + -6.7060699462890625, + -5.436615943908691, + -3.978389263153076, + -6.785447597503662, + -6.147171497344971, + -3.97414231300354, + -4.332991600036621, + -0.9269428253173828, + -5.1237101554870605, + -4.486598968505859, + -0.04678357392549515, + -1.0307552814483643, + -1.4249452352523804, + -4.517682075500488, + -3.561821699142456, + -2.0815205574035645, + -0.6041194200515747, + -5.992964744567871, + -7.092092514038086, + -0.48916709423065186, + -2.6405677795410156, + -4.3345723152160645, + -3.533582925796509, + -3.1233346462249756, + -3.107872486114502, + -1.9901115894317627, + -3.1052846908569336, + -1.8440347909927368, + -6.21368408203125, + -1.8796799182891846, + -2.705214738845825, + -0.2987763583660126, + -4.070865154266357, + -1.6675832271575928, + -1.3896636962890625, + -1.5731089115142822, + -3.526170015335083, + -2.5088443756103516, + -1.208929419517517, + -3.673125743865967, + -2.501532554626465, + -6.875064373016357, + -8.512459754943848, + -1.042314052581787, + -3.657850980758667, + -7.0950798988342285, + -4.974049091339111, + -8.14085578918457, + -3.529888153076172, + -1.9389504194259644, + -7.0902204513549805, + -2.409292459487915, + -2.9428021907806396, + -1.688283085823059, + -3.622368335723877, + -2.0903351306915283, + -4.160663604736328, + -3.1683764457702637, + -1.2135626077651978, + -7.566033363342285, + -3.1186251640319824, + -5.899919509887695, + -0.9518840312957764, + -2.656729221343994, + -2.2994377613067627, + -6.806836128234863, + -1.280236840248108, + -2.838846206665039, + -1.3598848581314087, + -11.707776069641113, + -3.134333372116089, + -0.6230669617652893, + -8.219222068786621, + -7.562507152557373, + -7.489459037780762, + -1.5368008613586426, + -7.149652481079102, + -5.749268054962158, + -3.162869691848755, + -2.7235195636749268, + -6.128931999206543, + -1.1934199333190918, + -3.986410617828369, + -3.76609468460083, + -1.712721586227417, + -3.195504903793335, + -8.397743225097656, + -3.1260581016540527, + -9.792022705078125, + -4.217884540557861, + -11.583260536193848, + -5.987588882446289, + -5.178754806518555, + -6.994749069213867, + -5.167606353759766, + -7.124668121337891, + -6.201416015625, + -10.203682899475098, + -6.858526229858398, + -2.733592987060547, + -5.078882217407227, + -9.003358840942383, + -4.704894542694092, + -3.9085562229156494, + -7.247268199920654, + -7.091092109680176, + -4.4150166511535645, + -7.56699275970459, + -9.485116004943848, + -1.9977033138275146, + -6.65272331237793, + -2.236643075942993, + -7.518955707550049, + 
-5.525973320007324, + -4.67877721786499, + -6.608670234680176, + -5.536133766174316, + -10.772479057312012, + -10.8853178024292, + -3.6156129837036133, + -6.751470565795898, + -6.4537434577941895, + -3.4220399856567383, + -8.251005172729492, + -3.2146153450012207, + -6.330069541931152, + -1.5551663637161255, + -6.520583629608154, + -10.450878143310547, + -5.8788957595825195, + -3.7398200035095215, + -3.9084208011627197, + -0.3640081584453583, + -6.961522102355957, + -6.066243648529053, + -7.270624160766602, + -5.098455429077148, + -2.7642822265625, + -5.460171699523926, + -7.362828731536865, + -2.558631658554077, + -2.186410427093506, + -2.5309929847717285, + -2.46756649017334, + -2.0306026935577393, + -1.8713470697402954, + -2.108008623123169, + -1.2698389291763306, + -2.1712756156921387, + -2.4432802200317383, + -1.1477653980255127, + -1.8417484760284424, + -2.5971946716308594, + -1.8250831365585327, + -2.103092670440674, + -2.5183165073394775, + -2.9367291927337646, + -1.9412965774536133, + -1.7692793607711792, + -2.864521026611328, + -3.1332175731658936, + -1.098311185836792, + -2.946441173553467, + -2.2800471782684326, + -3.1929852962493896, + -2.754260778427124, + -3.485616445541382, + -3.3010287284851074, + -2.5537776947021484, + -2.6752865314483643, + -3.1617612838745117, + -2.4571690559387207, + -2.060081958770752, + -2.425969362258911, + -2.212725877761841, + -2.4232254028320312, + -3.0587053298950195, + -2.4074010848999023, + -2.457937479019165, + -2.319617986679077, + -2.6340954303741455, + -2.599524736404419, + -2.5302212238311768, + -1.6849274635314941, + -2.2609786987304688, + -2.039928674697876, + -1.9474098682403564, + -2.3550753593444824, + -1.718749761581421, + -2.413884162902832, + -1.6247628927230835, + -2.4784040451049805, + -1.828325629234314, + -1.3880831003189087, + -1.4448199272155762, + -1.1477117538452148, + -1.1669728755950928, + -1.8787822723388672, + -1.5565840005874634, + -1.6666553020477295, + -1.747725248336792, + -1.959598422050476, + -2.0376486778259277, + -2.345367431640625, + -2.055098533630371, + -1.3940613269805908, + -3.4385242462158203, + -2.7489635944366455, + -3.2590157985687256, + -3.1128957271575928, + -1.7070379257202148, + -3.9010369777679443, + -3.21574068069458, + -3.3850393295288086, + -1.8778185844421387, + -2.698211908340454, + -1.8060741424560547, + -2.0845324993133545, + -3.4797585010528564, + -2.263254404067993, + -3.083108901977539, + -1.6589758396148682, + -2.687279224395752, + -1.77505624294281, + -2.6142921447753906, + -1.934045672416687, + -1.8834377527236938, + -2.8038980960845947, + -1.550542950630188, + -3.4054152965545654, + -1.724036693572998, + -2.3146564960479736, + -1.5134503841400146, + -2.9289023876190186, + -1.5285141468048096, + -3.421035051345825, + -1.3757282495498657, + -3.441431760787964, + -1.5286564826965332, + -3.4372904300689697, + -3.173043966293335, + -1.1313854455947876, + -1.486415147781372, + -2.506413221359253, + -3.5165903568267822, + -1.4112176895141602, + -3.7175354957580566, + -2.2170844078063965, + -0.704839289188385, + -0.6626103520393372, + -2.5483946800231934, + -0.949668288230896, + -3.1339564323425293, + -1.3326977491378784, + -3.5493476390838623, + -0.6785370111465454, + -3.236161470413208, + -2.1347084045410156, + -3.802447557449341, + -2.585503339767456, + -3.9361765384674072, + -2.290905714035034, + -3.714280605316162, + -1.793616533279419, + -3.7252492904663086, + -1.450188159942627, + -3.11938738822937, + -1.25174880027771 + ] + }, + "32": { + "input_prompt": "create a conversational 
article", + "generated_text": " about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article. The article should be about the topic of the article.", + "generated_tokens": [ + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046, + 1531, + 9369, + 2715, + 1402, + 2314, + 1278, + 17915, + 1307, + 1278, + 9369, + 1046 + ], + "latency": 47.09983468055725, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.4165568351745605, + -11.358176231384277, + -0.0701780766248703, + -7.797665119171143, + -2.6805992126464844, + -1.4707680940628052, + -3.0390255451202393, + -1.6902849674224854, + -1.270594835281372, + -1.1936196088790894, + -1.2523558139801025, + -2.7270259857177734, + -1.2371309995651245, + -0.9618493318557739, + -0.4379909038543701, + -1.3917063474655151, + -1.1055524349212646, + -0.9122569561004639, + -0.9911308288574219, + -0.08436793833971024, + -0.5424078106880188, + -0.9181017279624939, + -0.5873759388923645, + -0.19014373421669006, + -0.06655456870794296, + -0.15252672135829926, + -0.09415211528539658, + -0.009787309914827347, + -0.013910251669585705, + -0.005296128336340189, + -0.005677408073097467, + -0.02013739012181759, + -0.21594694256782532, + -0.07153760641813278, + -0.0066444179974496365, + -0.010198505595326424, + -0.011980246752500534, + -0.003686776151880622, + -0.0037619550712406635, + -0.0022467151284217834, + -0.004088377580046654, + -0.021828632801771164, + -0.0012669878778979182, + -0.09768074005842209, + -0.02652405947446823, + -0.0019286142196506262, + -0.002283824374899268, + -0.0032225127797573805, + -0.0009741804678924382, + -0.0009415484382770956, + -0.001211624126881361, + -0.001135300612077117, + -0.002340436913073063, + -0.0010846928926184773, + -0.0509282611310482, + -0.03832047060132027, + -0.00257422705180943, + -0.0022806129418313503, + -0.00262785074301064, + -0.0008195855189114809, + -0.0010239601833745837, + -0.0013777059502899647, + -0.0009899006690829992, + -0.0018756669014692307, + -0.0015304292319342494, + -0.08506463468074799, + -0.01893703266978264, + -0.0013797297142446041, + -0.0014461545506492257, + -0.0013971101725474, + 
-0.0005869334563612938, + -0.0005212855176068842, + -0.000876757490914315, + -0.0005256939912214875, + -0.0012863941956311464, + -0.0015691122971475124, + -0.051276568323373795, + -0.00973513163626194, + -0.0010469438275322318, + -0.0011531615164130926, + -0.0009969270322471857, + -0.00038342276820912957, + -0.0004032037395518273, + -0.000730247818864882, + -0.0003275334893260151, + -0.0008700875914655626, + -0.0017572689102962613, + -0.044393111020326614, + -0.013102858327329159, + -0.0011463745031505823, + -0.0012070996453985572, + -0.0012325793504714966, + -0.0005048430757597089, + -0.0004876854654867202, + -0.0007901645149104297, + -0.00041500062798149884, + -0.0009869233472272754, + -0.0018687656847760081, + -0.03943866863846779, + -0.014425630681216717, + -0.0014756753807887435, + -0.001423775334842503, + -0.001209719106554985, + -0.0005046047735959291, + -0.00042167355422861874, + -0.0007688426994718611, + -0.0002699726028367877, + -0.0006598440813831985, + -0.0017849955474957824, + -0.038999658077955246, + -0.012665312737226486, + -0.0014427024871110916, + -0.0014492495683953166, + -0.001016576774418354, + -0.00042083943844772875, + -0.00033241944038309157, + -0.0006403064471669495, + -0.00022373080719262362, + -0.0007053509471006691, + -0.0018597226589918137, + -0.030997740104794502, + -0.011259939521551132, + -0.0012655591126531363, + -0.00134151556994766, + -0.0008106521563604474, + -0.0003513672563713044, + -0.0002964295563288033, + -0.0006368515896610916, + -0.00020180096908006817, + -0.0005779979983344674, + -0.0016014858847483993, + -0.0271126888692379 + ] + }, + "64": { + "input_prompt": "App Concept: \"Eggy's Interactive Adventure World\"\n\nEggy's Interactive Adventure World is a cutting-edge app designed for phones and tablets that combines immersive storytelling with engaging, interactive experiences for both parents and children. This innovative app allows users to explore the world of Eggy and participate in a series of adventures that teach essential life skills, foster personal growth, and strengthen parent-child bonds.\n\nKey Features:\n\n1. Interactive Stories: The app brings the 20 Eggy stories to life through interactive, animated storytelling. Users can read, listen, and actively participate in the stories, making choices that guide Eggy through various adventures, each with unique lessons and experiences.\n2. Augmented Reality (AR) Integration: The app utilizes AR technology, allowing users to interact with Eggy and the story characters in their real-world environment. This immersive experience creates a deeper connection with the stories and encourages engagement.\n3. Personalized Adventures: The app customizes each story based on the user's interests, needs, and personal growth areas. By collecting user input, the app tailors the storylines and learning outcomes to ensure that each adventure resonates with the child's unique experiences.\n4. Parent-Child Cooperative Play: The app encourages parents and children to collaborate and engage in the stories together. Through a split-screen mode or multi-device connectivity, parents and children can actively participate in the stories and learn essential life skills together.\n5. Skill-Building Mini-Games: Interspersed throughout the stories are educational mini-games that reinforce the lessons and themes. These games are designed to be fun, engaging, and accessible to children, encouraging them to practice and develop various skills.\n6. 
Progress Tracking and Rewards: The app tracks the user's progress through the stories and rewards them with badges, trophies, and other collectibles. This gamification aspect encourages continued engagement and fosters a sense of accomplishment and growth.\n7. Community Features: The app provides a safe, moderated online community where parents can share their experiences, discuss the stories and lessons, and connect with other families who are also using the app. This community fosters support, inspiration, and learning.\n8. Expert Resources: The app includes access to resources from experts in child development, psychology, and education, providing parents with additional tools and guidance on fostering the growth and well-being of their children.\n\nEggy's Interactive Adventure World is a one-of-a-kind, mind-blowing app that revolutionizes the way parents and children engage with stories and learn essential life skills. Combining the power of storytelling, cutting-edge technology, and expert resources, this app is a must-have for families looking to strengthen their bonds and support their children's personal growth.", + "generated_text": "The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season of the National Hockey League (NHL). The 1999\u20132000 season was the 10th season was the 10th season was the 10 season was the 10 season was the 10", + "generated_tokens": [ + 2, + 1784, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1307, + 1278, + 5805, + 46982, + 8525, + 1319, + 19644, + 1076, + 1577, + 1531, + 1032, + 1049, + 1057, + 1057, + 1057, + 1882, + 1050, + 1048, + 1048, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 1411, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048, + 5526, + 1486, + 1278, + 1032, + 1049, + 1048 + ], + "latency": 85.16301679611206, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -12.107745170593262, + -2.9727728366851807, + -3.720092535018921, + -5.592433929443359, + -10.964235305786133, + -3.654498338699341, + -9.33439826965332, + -4.833785057067871, + -5.187321662902832, + -2.6944785118103027, + -6.9262237548828125, + -0.654232919216156, + -0.5550781488418579, + -0.21346639096736908, + -0.0134271876886487, + -0.010840100236237049, + -1.3878544569015503, + -0.6296291351318359, + -7.9766011238098145, + -0.4393192231655121, + -5.639142036437988, + -3.277270793914795, + -1.0206468105316162, + -11.703084945678711, + -0.7100943922996521, + -0.2809169888496399, + -2.771284818649292, + -7.190817832946777, + -4.048691749572754, + -0.012056218460202217, + -3.3802318572998047, + -0.6807184815406799, + -3.4844107627868652, + -3.312331199645996, + -0.5001641511917114, + -2.61255145072937, + -4.243694305419922, + -4.333778381347656, + 
-6.0625810623168945, + -0.011777156963944435, + -0.37577226758003235, + -0.9490834474563599, + -3.5450198650360107, + -2.1778035163879395, + -0.45957911014556885, + -3.00771164894104, + -1.7600425481796265, + -0.09766030311584473, + -2.467618942260742, + -1.329679012298584, + -0.8384320735931396, + -1.1864604949951172, + -3.628342866897583, + -0.2470003068447113, + -1.8938640356063843, + -5.168431282043457, + -0.05005566030740738, + -2.258014678955078, + -2.449028968811035, + -0.0034086955711245537, + -3.9485883712768555, + -1.6201664209365845, + -5.139942646026611, + -4.859354496002197, + -0.23686674237251282, + -0.5541543364524841, + -2.5826025009155273, + -6.114635467529297, + -4.3380208015441895, + -0.7412900924682617, + -0.3221715986728668, + -0.13805493712425232, + -4.1797332763671875, + -7.3456268310546875, + -0.13762745261192322, + -2.0905232429504395, + -1.0178627967834473, + -4.108260631561279, + -0.6007124185562134, + -1.0410642623901367, + -4.122039794921875, + -0.35905471444129944, + -1.4274661540985107, + -4.139932155609131, + -0.4237431585788727, + -1.6294409036636353, + -0.9811424016952515, + -4.132790565490723, + -1.1318120956420898, + -6.8258256912231445, + -1.5455098152160645, + -0.6984409093856812, + -13.664215087890625, + -0.1166313961148262, + -1.6347849369049072, + -0.28875046968460083, + -0.03130083531141281, + -1.5293006896972656, + -1.6488375663757324, + -4.224111557006836, + -4.760683059692383, + -1.9758747816085815, + -1.5828256607055664, + -2.8463857173919678, + -0.2620386481285095, + -1.7243889570236206, + -1.7945923805236816, + -0.8884308338165283, + -0.3766394555568695, + -0.34033581614494324, + -9.05566692352295, + -0.22754782438278198, + -0.033802058547735214, + -0.34108465909957886, + -0.5644669532775879, + -2.0925779342651367, + -4.547505855560303, + -10.870464324951172, + -1.1072022914886475, + -5.503787994384766, + -3.259672164916992, + -0.007964519783854485, + -3.0111639499664307, + -4.246737480163574, + -0.7813188433647156, + -3.331031322479248, + -4.485962867736816, + -0.9492117166519165, + -2.6757047176361084, + -1.1591349840164185, + -1.122117519378662, + -2.629878044128418, + -5.986321926116943, + -0.2146703153848648, + -0.002392764901742339, + -7.372479438781738, + -0.007077385671436787, + -0.06599216908216476, + -0.0970711037516594, + -3.2874932289123535, + -0.0019583588000386953, + -0.9122000336647034, + -4.930907249450684, + -0.019508399069309235, + -0.308611661195755, + -0.07778516411781311, + -3.8497893810272217, + -0.46124517917633057, + -0.38821348547935486, + -2.668412208557129, + -1.845987319946289, + -0.06470083445310593, + -0.006619549356400967, + -1.2610487937927246, + -0.13015533983707428, + -3.365312099456787, + -0.0014690094394609332, + -1.6789823770523071, + -1.2499005794525146, + -3.3992111682891846, + -5.563300132751465, + -0.823418140411377, + -4.24124813079834, + -1.6597849130630493, + -0.6941139698028564, + -1.5637556314468384, + -0.5482053756713867, + -0.9507225751876831, + -3.764758586883545, + -0.0006518622976727784, + -0.7540555000305176, + -5.058262825012207, + -0.3302401602268219, + -2.8130555152893066, + -0.17079885303974152, + -2.871047019958496, + -0.3991694450378418, + -3.1476998329162598, + -0.3488404452800751, + -2.0545666217803955, + -4.201597690582275, + -5.164614677429199, + -0.0271432027220726, + -0.0009785869624465704, + -3.3444161415100098, + -1.3117046356201172, + -6.375423431396484, + -0.05535568296909332, + -0.3919340968132019, + -0.060594215989112854, + -6.507473468780518, + -0.0023910999298095703, + 
-2.143423318862915, + -3.335618257522583, + -2.953970432281494, + -0.0013383012264966965, + -0.8080525398254395, + -0.29526084661483765, + -0.04036511853337288, + -3.231475353240967, + -1.0585589408874512, + -6.136373043060303, + -0.006182829383760691, + -0.035548023879528046, + -5.509808540344238, + -1.8490750789642334, + -9.83314037322998, + -0.07037576287984848, + -3.1621387004852295, + -6.762360095977783, + -1.3490527868270874, + -3.601043462753296, + -1.176393985748291, + -0.4342959523200989, + -0.06266004592180252, + -5.464046001434326, + -0.017946599051356316, + -1.0416009426116943, + -1.6117159128189087, + -12.289417266845703, + -1.5004339218139648, + -5.76563835144043, + -4.038386821746826, + -0.20812086760997772, + -3.6306562423706055, + -1.3901070356369019, + -1.087137222290039, + -2.423213243484497, + -4.503086090087891, + -0.0008031480247154832, + -0.03627370297908783, + -0.1653430461883545, + -7.958648681640625, + -1.1018548011779785, + -1.290948748588562, + -3.8049263954162598, + -1.8253734111785889, + -0.059022851288318634, + -0.0013984196120873094, + -4.698851585388184, + -2.5421664714813232, + -0.024493809789419174, + -4.828659534454346, + -3.0295286178588867, + -3.550312042236328, + -0.1185273677110672, + -0.22595760226249695, + -0.10782183706760406, + -1.4033282995224, + -0.4485701024532318, + -0.2889708876609802, + -0.05471855774521828, + -0.007632025051862001, + -2.1156554222106934, + -0.6249589323997498, + -4.198577404022217, + -0.14178156852722168, + -4.284021377563477, + -2.227515935897827, + -3.5022120475769043, + -0.19575819373130798, + -15.964509963989258, + -4.055960655212402, + -11.125024795532227, + -0.7681724429130554, + -3.0436902046203613, + -7.030262470245361, + -4.376729488372803, + -5.476145267486572, + -0.4219042658805847, + -3.7689766883850098, + -0.060010604560375214, + -0.8134393692016602, + -0.11386934667825699, + -0.025473715737462044, + -0.09736856073141098, + -4.357361793518066, + -0.3670865297317505, + -0.08063744008541107, + -0.1311480849981308, + -1.0903867483139038, + -1.2705107927322388, + -1.5076212882995605, + -4.295275688171387, + -0.04185756668448448, + -0.19810955226421356, + -1.9645220041275024, + -0.9597910642623901, + -0.13429655134677887, + -0.002283110748976469, + -7.066074371337891, + -3.639211654663086, + -1.0263917446136475, + -8.124760627746582, + -1.132537841796875, + -0.09160765260457993, + -0.08996370434761047, + -10.165366172790527, + -3.501585006713867, + -0.0019847711082547903, + -0.05309417471289635, + -0.31209683418273926, + -0.15089339017868042, + -1.23564875125885, + -1.2685208320617676, + -7.832758903503418, + -0.19271136820316315, + -0.014305183663964272, + -0.0007532381569035351, + -0.44688940048217773, + -2.6239724159240723, + -1.738666296005249, + -1.6480977535247803, + -0.46753185987472534, + -8.656959533691406, + -3.79868483543396, + -0.9281394481658936, + -2.2381181716918945, + -1.7654449939727783, + -0.4948798418045044, + -0.025028761476278305, + -1.5435361862182617, + -1.6390818357467651, + -1.4962153434753418, + -0.3425217270851135, + -0.013077914714813232, + -0.038474079221487045, + -5.3364362716674805, + -0.42365288734436035, + -1.884093999862671, + -3.510357618331909, + -6.198029518127441, + -0.44375038146972656, + -0.0008789013954810798, + -3.6025230884552, + -1.419615626335144, + -2.6723289489746094, + -5.775190830230713, + -1.1380761861801147, + -2.6683366298675537, + -0.43395891785621643, + -0.003145867260172963, + -8.63144302368164, + -1.646262764930725, + -1.732487678527832, + -4.561546802520752, 
+ -0.5277953147888184, + -0.07333153486251831, + -0.5624169707298279, + -0.12201295047998428, + -2.6561455726623535, + -1.1071691513061523, + -2.6895060539245605, + -0.040864069014787674, + -0.04126371443271637, + -1.8294739723205566, + -0.09022177755832672, + -0.3154001832008362, + -0.46215569972991943, + -2.2462844848632812, + -0.30149081349372864, + -0.52588951587677, + -8.288043975830078, + -0.0002057340752799064, + -0.8021711707115173, + -4.4546098709106445, + -0.0001565095444675535, + -0.0015961299650371075, + -0.15216240286827087, + -0.3677564561367035, + -5.018707275390625, + -0.7850045561790466, + -1.9582659006118774, + -1.0046892166137695, + -10.0401029586792, + -0.16878114640712738, + -5.944240570068359, + -1.5523078441619873, + -5.7253522872924805, + -0.47948503494262695, + -0.44009655714035034, + -5.671053886413574, + -0.003280022880062461, + -0.7937742471694946, + -0.9639376401901245, + -0.00030048147891648114, + -1.0747740268707275, + -0.8839919567108154, + -3.416811466217041, + -1.6602673530578613, + -0.2706959843635559, + -0.0024333172477781773, + -4.478696823120117, + -6.20179557800293, + -0.11359559744596481, + -0.202009916305542, + -0.022310219705104828, + -2.367263078689575, + -1.0405994653701782, + -5.984308242797852, + -2.105138063430786, + -9.583202362060547, + -0.0004957877099514008, + -3.0655455589294434, + -0.0669412910938263, + -0.8977450728416443, + -2.2271294593811035, + -2.6617536544799805, + -1.8184051513671875, + -0.8291114568710327, + -0.4864235818386078, + -0.7993525862693787, + -3.51106858253479, + -2.1530935764312744, + -0.257144957780838, + -1.3934082984924316, + -1.3137131929397583, + -0.3384077548980713, + -0.1697217971086502, + -2.353395938873291, + -0.03406282886862755, + -0.39059701561927795, + -3.422821044921875, + -1.7117210626602173, + -0.7018465399742126, + -1.5995906591415405, + -3.6218395233154297, + -0.12497704476118088, + -0.16966234147548676, + -0.7313685417175293, + -0.4956285357475281, + -1.0840849876403809, + -5.042126655578613, + -0.00031704644788987935, + -7.683258056640625, + -0.9210801720619202, + -4.687852382659912, + -0.0028814247343689203, + -0.043382611125707626, + -4.1948652267456055, + -2.66593337059021, + -0.06153333932161331, + -0.0023110604379326105, + -6.729236602783203, + -5.777127742767334, + -0.08932067453861237, + -0.09890018403530121, + -0.009886111132800579, + -3.1145148277282715, + -3.725565195083618, + -0.0021998509764671326, + -3.9927196502685547, + -2.753793239593506, + -1.6037236452102661, + -0.17461130023002625, + -4.804804801940918, + -0.2311229705810547, + -0.30256444215774536, + -2.235363006591797, + -0.006614102050662041, + -0.34757524728775024, + -1.4946835041046143, + -1.222062587738037, + -3.658839225769043, + -1.356170892715454, + -0.5371109843254089, + -3.7580835819244385, + -4.54621696472168, + -0.31577637791633606, + -3.677156925201416, + -2.7181396484375, + -7.4674882888793945, + -0.00019369633810129017, + -2.3798398971557617, + -2.5452184677124023, + -0.2858496308326721, + -4.315659523010254, + -0.025835415348410606, + -0.000603493710514158, + -0.2546294331550598, + -0.12032663822174072, + -2.006908655166626, + -5.990736961364746, + -7.146596908569336, + -0.23356498777866364, + -0.2201036810874939, + -0.01235415879637003, + -0.011248741298913956, + -1.4155778884887695, + -0.40242519974708557, + -5.877886772155762, + -0.7865053415298462, + -0.03231288120150566, + -0.004864405374974012, + -0.0050629740580916405, + -2.7049152851104736, + -6.822089195251465, + -0.39252761006355286, + 
-1.2290617227554321, + -0.007630132604390383, + -3.485461711883545, + -0.47985684871673584, + -6.1813530921936035, + -0.03757825121283531, + -0.37834712862968445, + -0.22192610800266266, + -1.165318489074707, + -0.5220151543617249, + -0.1289423257112503, + -3.216222047805786, + -1.0787583589553833, + -3.0716826915740967, + -0.6023419499397278, + -2.558605194091797, + -0.927433431148529, + -0.00364841241389513, + -0.14910078048706055, + -0.7318926453590393, + -6.159773826599121, + -0.0015301911626011133, + -1.8908276557922363, + -1.9641315937042236, + -0.021651331335306168, + -2.1648828983306885, + -2.2700207233428955, + -7.833290100097656, + -0.03397307172417641, + -0.8344621658325195, + -0.02225659228861332, + -0.06639260798692703, + -2.3780317306518555, + -3.180129051208496, + -0.09030630439519882, + -2.4138312339782715, + -1.3445552587509155, + -1.848326325416565, + -0.9726964831352234, + -2.851792335510254, + -0.0630769282579422, + -0.0011394681641831994, + -0.05843213573098183, + -2.6616668701171875, + -1.575437068939209, + -0.180197611451149, + -5.552371501922607, + -0.26108410954475403, + -2.529611587524414, + -0.37780019640922546, + -5.141795635223389, + -0.5921107530593872, + -0.2474975287914276, + -0.10687454044818878, + -4.891775131225586, + -0.25011152029037476, + -2.4100728034973145, + -1.358667016029358, + -2.790961503982544, + -3.8654675483703613, + -1.0076243877410889, + -0.7456949949264526, + -1.5575554370880127, + -2.05328631401062, + -1.6538066864013672, + -0.0558217354118824, + -0.0001817776501411572, + -0.0011643542675301433, + -0.038359593600034714, + -1.4208931922912598, + -0.542127251625061, + -0.3162364959716797, + -0.3966117799282074, + -1.1765563488006592, + -1.7920958995819092, + -0.18425509333610535, + -0.1092008650302887, + -0.46676987409591675, + -0.24977745115756989, + -1.0375996828079224, + -0.5268858671188354, + -0.008942908607423306, + -0.6404479146003723, + -0.0033111530356109142, + -5.3165931603871286e-05, + -0.5154370665550232, + -0.39286962151527405, + -1.401839256286621, + -0.6232213973999023, + -0.02168831042945385, + -0.004282470792531967, + -0.005199837032705545, + -0.09748794883489609, + -0.040823787450790405, + -0.00014852374442853034, + -0.0005832401220686734, + -0.005303124897181988, + -0.6537013053894043, + -0.38026049733161926, + -0.04189129173755646, + -0.010385753586888313, + -0.008756335824728012, + -0.013362848199903965, + -0.000504723924677819, + -0.002797620603814721, + -0.0014512732159346342, + -0.0013321106089279056, + -0.010883613489568233, + -0.005159396678209305, + -0.004701037425547838, + -0.01591104455292225, + -0.001474246964789927, + -1.2278481335670222e-05, + -0.010548785328865051, + -0.08341525495052338, + -0.03858809545636177, + -0.056062061339616776, + -0.0009532198309898376, + -0.0005789510905742645, + -0.0008986725588329136, + -0.00710969977080822, + -0.0006561510381288826, + -1.4781842764932662e-05, + -5.578839045483619e-05, + -0.0006398299592547119, + -0.0028786908369511366, + -0.0034092895220965147, + -0.008268529549241066, + -0.006602259818464518, + -0.004517706111073494, + -0.02233586646616459, + -0.0006323245470412076, + -0.009195122867822647, + -0.0029284947086125612, + -0.004457537550479174, + -0.017873765900731087, + -0.008801711723208427, + -0.0036383166443556547, + -0.08078611642122269, + -0.006347495596855879, + -0.0002177716523874551, + -0.04688572511076927, + -0.2718890309333801, + -0.07996802777051926, + -0.04674842208623886, + -0.009984076954424381, + -0.010000954382121563, + -0.050126753747463226, + 
-0.5864179730415344, + -0.2915390133857727, + -0.008090462535619736, + -0.032190412282943726, + -0.03461571782827377, + -0.2785419523715973, + -0.05830562859773636, + -0.02893950417637825, + -0.12241066247224808, + -0.02711048536002636, + -0.16450686752796173, + -0.09852994978427887, + -0.2651047706604004, + -0.35559725761413574, + -0.12606258690357208, + -0.32793670892715454, + -0.20878805220127106, + -0.7587923407554626, + -0.4730657637119293, + -1.496794581413269, + -0.2486363798379898, + -0.20723387598991394, + -0.1872958242893219, + -0.19151091575622559, + -0.3350580036640167, + -1.3085839748382568, + -0.6109102964401245, + -0.2947172224521637, + -0.37130236625671387, + -0.19888469576835632, + -0.18297068774700165, + -0.9978674054145813, + -0.5471905469894409, + -0.4379975199699402, + -0.407988041639328, + -0.17325688898563385, + -0.16938896477222443 + ] + }, + "96": { + "input_prompt": "what is the concept of double materiality in sustainability?", + "generated_text": " What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double materiality in sustainability? What is the concept of double material", + "generated_tokens": [ + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318, + 1532, + 1294, + 60187, + 1063, + 5675, + 1395, + 1278, + 7401, + 1307, + 5558, + 4318 + ], + "latency": 123.03724575042725, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -4.917365074157715, + -0.9960631132125854, + -7.875392913818359, + -0.2993181347846985, + -7.760880470275879, + -10.308395385742188, + -2.1807961463928223, + -1.6412583589553833, + -9.521512985229492, + -1.627489447593689, + -1.8410861492156982, + -0.9285702705383301, + -0.2576955556869507, + -0.9641067981719971, + -0.02314644306898117, + -0.6696561574935913, + -0.07035009562969208, + -0.004622488282620907, + -0.025748632848262787, + -0.06276137381792068, + -0.17385317385196686, + -0.3285445272922516, + -0.0592009499669075, + -0.007940039038658142, + -0.22664028406143188, + -0.0017957051750272512, + -0.022929180413484573, + -0.005733947269618511, + 
-0.0012996093137189746, + -0.006419987417757511, + -0.02376849390566349, + -0.27800270915031433, + -0.4650723934173584, + -0.04936715215444565, + -0.003972141072154045, + -0.01477995328605175, + -0.0012044801842421293, + -0.014891182072460651, + -0.002709767082706094, + -0.0009939497103914618, + -0.0028436246793717146, + -0.006759870797395706, + -0.15416178107261658, + -0.20121537148952484, + -0.016414370387792587, + -0.0015769677702337503, + -0.008138825185596943, + -0.0007713441736996174, + -0.013819841668009758, + -0.003826678032055497, + -0.0005918181850574911, + -0.0014938872773200274, + -0.00485716899856925, + -0.081083282828331, + -0.09642580896615982, + -0.009630884043872356, + -0.0010948146227747202, + -0.007085552904754877, + -0.0006310140597634017, + -0.013073914684355259, + -0.0039152647368609905, + -0.000364713923772797, + -0.001292108790948987, + -0.004158303141593933, + -0.044283974915742874, + -0.05722038820385933, + -0.006369172595441341, + -0.0007976687629707158, + -0.005993015132844448, + -0.0004935238393954933, + -0.011310506612062454, + -0.002951553324237466, + -0.000387831823900342, + -0.000977038755081594, + -0.0036971091758459806, + -0.030511993914842606, + -0.04246694967150688, + -0.004863100592046976, + -0.0006927236099727452, + -0.005206122528761625, + -0.0005129451747052372, + -0.00894621666520834, + -0.0028565814718604088, + -0.00041333239641971886, + -0.0009002208826132119, + -0.0033131728414446115, + -0.021188799291849136, + -0.03330245241522789, + -0.0038543473929166794, + -0.0006504327175207436, + -0.004474864806979895, + -0.00048029806930571795, + -0.009718249551951885, + -0.0030443770810961723, + -0.0003743662964552641, + -0.0009439303539693356, + -0.003729770192876458, + -0.016505014151334763, + -0.0290373582392931, + -0.003315192647278309, + -0.0005821678787469864, + -0.004148805979639292, + -0.00042489083716645837, + -0.006856840569525957, + -0.0028660909738391638, + -0.00032574593205936253, + -0.0006986799417063594, + -0.003671098267659545, + -0.012792548164725304, + -0.02553274855017662, + -0.002730690874159336, + -0.0005067494930699468, + -0.0036923582665622234, + -0.0004451475979294628, + -0.007225453853607178, + -0.002746859099715948, + -0.0003323002893012017, + -0.0008517451351508498, + -0.003630714723840356, + -0.011284693144261837, + -0.02186688780784607, + -0.0025819556321948767, + -0.0004931663861498237, + -0.0031721293926239014, + -0.00040725519647821784, + -0.0062383925542235374, + -0.00238693761639297, + -0.00040749352774582803, + -0.0007970731821842492, + -0.003604583442211151, + -0.010075542144477367, + -0.022386692464351654, + -0.0025295186787843704, + -0.00046302087139338255, + -0.003138143103569746, + -0.0004233417857903987, + -0.006178564392030239, + -0.0021037368569523096 + ] + }, + "127": { + "input_prompt": "as before we talk about using , i usually do if else() to in case the property would be null before.\nif today i public a property and i want to use using to instead , will code be like this?\n\npublic Material Mat;\n\nusing(Mat){\n// Do something;\n}\n\nis that right?", + "generated_text": " i am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using the wrong syntax or if i am using the syntax correctly 
but i am not sure if i am using the syntax correctly or if i am using the syntax incorrectly.\n\nI am not sure if i am using", + "generated_tokens": [ + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 8462, + 22692, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1809, + 1623, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 17047, + 1505, + 1693, + 1623, + 1855, + 2505, + 1278, + 22692, + 73751, + 1338, + 1073, + 1855, + 1605, + 5257, + 1693, + 1623, + 1855, + 2505 + ], + "latency": 151.56505846977234, + "cuda_graph_request_count_map": null, + "step_count": 6144, + "logprobs": [ + -7.793755531311035, + -4.224940776824951, + -5.446587562561035, + -0.267395555973053, + -6.118141174316406, + -11.105525970458984, + -6.67517614364624, + -6.902530670166016, + -3.2218151092529297, + -5.99644660949707, + -8.431673049926758, + -8.04836368560791, + -3.992594003677368, + -7.550079345703125, + -2.873685836791992, + -3.3877997398376465, + -7.9306745529174805, + -7.005484580993652, + -1.0481306314468384, + -2.686237335205078, + -6.131283760070801, + -6.2994704246521, + -7.931419372558594, + -11.3147554397583, + -8.544670104980469, + -12.01729679107666, + -3.89847469329834, + -1.7964364290237427, + -2.952878952026367, + -1.9217232465744019, + -2.272329330444336, + -0.37552154064178467, + -1.667820692062378, + -7.510344505310059, + -3.498040199279785, + -7.980632305145264, + -7.672002792358398, + -4.4999470710754395, + -7.155375003814697, + -2.4486124515533447, + -4.785946846008301, + -1.153855800628662, + -2.3994438648223877, + -4.250652313232422, + -12.24446964263916, + -8.344388008117676, + -2.608186721801758, + -5.200589179992676, + -8.25888442993164, + -3.6245617866516113, + -7.689338207244873, + -7.345355033874512, + -1.2661759853363037, + -7.265620231628418, + -1.9884108304977417, + -6.269482612609863, + -2.41705584526062, + -1.8929681777954102, + -1.8259913921356201, + -2.0997350215911865, + -2.323200225830078, + -1.3998825550079346, + -0.8789899945259094, + -1.082053542137146, + -1.1831339597702026, + -1.4462857246398926, + -1.6481035947799683, + -1.4408715963363647, + -1.2603964805603027, + -1.5267670154571533, + -1.6345772743225098, + -1.3796477317810059, + -0.7609691023826599, + -0.3548354506492615, + -0.7552334666252136, + -0.44776833057403564, + -1.1078286170959473, + -1.3036658763885498, + -0.5214896202087402, + -0.8486822843551636, + -0.22470997273921967, + -0.4705755412578583, + -0.5639711022377014, + -0.5388108491897583, + -0.6052999496459961, + -0.1002030223608017, + -0.286334365606308, + -0.45798981189727783, + -1.0107953548431396, + -0.11875647306442261, + -0.6969441771507263, + -0.4609107971191406, + -0.07614769786596298, + -0.5035472512245178, + -0.1682187020778656, + -0.10476160794496536, + -0.6586751341819763, + -0.35806939005851746, + -1.5364394187927246, + 
-2.4093759059906006, + -1.977368950843811, + -1.6216907501220703, + -0.27647316455841064, + -0.2991848587989807, + -0.2783535420894623, + -0.05913994088768959, + -0.03023873083293438, + -0.043339803814888, + -0.7320341467857361, + -0.0030677898321300745, + -0.0332595594227314, + -0.012804670259356499, + -0.004041599575430155, + -0.0014899593079462647, + -0.001948602613992989, + -0.0029070996679365635, + -0.040939707309007645, + -0.013942227698862553, + -0.04897322878241539, + -0.011005887761712074, + -0.0044113704934716225, + -0.0013179434463381767, + -0.003658389439806342, + -0.009758152067661285, + -0.0014104428701102734, + -0.0016671819612383842, + -0.000771939754486084, + -0.0015519729349762201, + -0.003720743814483285, + -0.004249115474522114, + -0.00485657574608922, + -0.005053604021668434, + -0.002336274366825819, + -0.0009155849111266434, + -0.0004978132783435285, + -0.0005953923100605607, + -0.0011395872570574284, + -0.001485078944824636, + -0.3072909712791443, + -1.7295066118240356, + -0.4807289242744446, + -0.1245415136218071, + -0.011858444660902023, + -0.020613837987184525, + -0.011020978912711143, + -0.003106294432654977, + -0.0009966888464987278, + -0.0019349202048033476, + -0.037407051771879196, + -0.0003496989083942026, + -0.005922981072217226, + -0.007394562941044569, + -0.0006037319544702768, + -0.0008836655179038644, + -0.0002884448622353375, + -0.00047600860125385225, + -0.0024947968777269125, + -0.00442774873226881, + -0.004059052560478449, + -0.0018594847060739994, + -0.0006179092451930046, + -0.00022635281493421644, + -0.0006730675231665373, + -0.003022746881470084, + -0.0002343380037928, + -0.00047791501856409013, + -9.440929716220126e-05, + -0.00021550717065110803, + -0.0013523490633815527, + -0.0032202552538365126, + -0.001157686347141862, + -0.004449942149221897, + -0.0016590891173109412, + -0.00101062236353755, + -0.0003079893649555743, + -0.00048375347978435457, + -0.0021734442561864853, + -0.00423036003485322, + -0.11514264345169067, + -0.8658493757247925, + -0.084366075694561, + -0.02140468917787075, + -0.0060798698104918, + -0.008638513274490833, + -0.003212531330063939, + -0.0009598892065696418, + -0.00032085992279462516 + ] + }, + "throughput": [107.66332959870442, 107.66332959870442] +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/model_config.yaml new file mode 100644 index 00000000000..6230e63d755 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch/model_config.yaml @@ -0,0 +1,65 @@ +# Mirrors legacy test_case: gpt_dynamic_inference_tp1_pp1_583m_logitsmatch +# (driven by examples/inference/legacy/gpt_dynamic_inference.py). +# This case drives examples/inference/offline_inference.py in sync direct mode +# (no --use-coordinator, no --async-mode). Goldens copied from the legacy case; +# outputs must match byte-for-byte since the code path differs only in entry wrapper. 
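+# The ENV_VARS below pin determinism so the byte-for-byte comparison is
+# meaningful: NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 forbids nondeterministic
+# Transformer Engine kernels, NCCL_ALGO=Ring fixes the collective algorithm,
+# and CUBLAS_WORKSPACE_CONFIG=:4096:8 is the standard cuBLAS determinism knob.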
+ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1 + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --inference-dynamic-batching-max-requests-override: 8 # hardcode decode padding tokens to 7 for reproducibility + --inference-dynamic-batching-buffer-guaranteed-fraction: 0.05 + --inference-dynamic-batching-buffer-overflow-factor: 1.2 + --inference-dynamic-batching-buffer-size-gb: 20 + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${INFERENCE_OUTPUT_PATH} + --output-every-n-results: 32 + --prompt-file: ${DATA_PATH}/text/sharegpt-vicuna/filtered/processed.jsonl + --prompt-file-num-truncate: 128 # originally 1024 + --num-tokens-to-generate: 128 # originally 512 + --incoming-requests-per-step: 32 + --termination-id: -1 + --inference-repeat-n: 3 + --inference-logging-step-interval: 1 +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..92eb5c6cab0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/golden_values_dev_dgx_h100.json @@ -0,0 +1,158 @@ +{ + "0": { + "input_prompt": "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies.", + "generated_text": " And that this is the place where you can be yourself, and be yourself in the most beautiful way. 
And that this is the place where you can", + "generated_tokens": [ + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710, + 1402, + 14019, + 1044, + 1321, + 1402, + 14019, + 1294, + 1278, + 2725, + 15568, + 3039, + 1046, + 3060, + 1455, + 1593, + 1395, + 1278, + 3535, + 2478, + 1636, + 1710 + ], + "latency": 1.964757651090622, + "logprobs": [ + -9.358587265014648, + -2.7594826221466064, + -4.608366012573242, + -1.4093360900878906, + -0.6152952313423157, + -1.7217562198638916, + -2.496668815612793, + -2.0547454357147217, + -2.441960573196411, + -6.280838966369629, + -1.5643692016601562, + -3.462346076965332, + -4.428728103637695, + -3.8633861541748047, + -1.9936373233795166, + -1.8929449319839478, + -3.796365737915039, + -6.8360137939453125, + -0.2901247441768646, + -0.9246833324432373, + -6.633338928222656, + -7.166708469390869, + -12.771251678466797, + -2.198296308517456, + -3.7778120040893555, + -0.4983733296394348, + -4.381269454956055, + -0.0666784718632698, + -0.09580295532941818, + -3.2437636852264404, + -10.079947471618652, + -1.172220230102539, + -5.977442741394043, + -5.046236038208008, + -3.855658531188965, + -2.5585858821868896, + -3.356245994567871, + -5.557229518890381, + -1.6787731647491455, + -5.483290672302246, + -12.218501091003418, + -12.61402702331543, + -0.09662941098213196, + -2.5431432723999023, + -1.4071024656295776, + -2.9154715538024902, + -1.1964417695999146, + -0.006458481773734093, + -3.3625335693359375, + -13.262511253356934, + -4.314079761505127, + -2.617699146270752, + -5.987792015075684, + -0.778266429901123, + -0.048888545483350754, + -1.548882007598877, + -1.1381981372833252, + -5.627166748046875, + -0.4078553318977356, + -4.958505630493164, + -0.6187160611152649, + -0.7174848914146423, + -2.469533920288086, + -13.620073318481445, + -0.09088654816150665, + -3.526974678039551, + -1.4195809364318848, + -6.402483940124512, + -0.5898402333259583, + -3.565917491912842, + -0.8561318516731262, + -1.6140165328979492, + -5.370549201965332, + -17.159223556518555, + -6.583524703979492, + -0.8855001926422119, + -4.19431209564209, + -1.2012220621109009, + -2.2563133239746094, + -1.7674944400787354, + -0.22064533829689026, + -9.292220115661621, + -0.12445646524429321, + -7.29617977142334, + -2.526529312133789, + -4.071560859680176, + -3.5568013191223145, + -1.926215410232544, + -2.349026918411255, + -2.2132363319396973, + -0.3125414550304413, + -1.4718132019042969, + -2.149106740951538, + -1.0855519771575928, + -1.631832242012024, + -1.3751734495162964, + -1.9396103620529175, + -1.5293723344802856, + -0.8444125056266785, + -1.2414811849594116, + -1.9522171020507812, + -2.4338042736053467, + -1.5651824474334717, + -0.9498789310455322, + -1.8044980764389038, + -2.356677770614624, + -1.247452974319458, + -1.550165057182312, + -0.5635553598403931, + -0.6177330017089844, + -0.4778785705566406, + -0.020452087745070457, + -0.48500269651412964, + -0.23854275047779083, + -0.06543659418821335, + -0.11837350577116013, + -0.0585334412753582 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml new file mode 100644 index 00000000000..f5d1395ed44 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq/model_config.yaml @@ -0,0 +1,61 @@ +# Mirrors legacy test_case: 
gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq +# (driven by examples/inference/legacy/gpt_dynamic_inference_with_coordinator.py). +# This case drives examples/inference/offline_inference.py in sync coordinator mode +# (--use-coordinator, no --async-mode). Goldens copied from the legacy case; +# outputs must match byte-for-byte since the engine path is unchanged. +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: inference +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 4096 + --attention-backend: flash + --use-checkpoint-args: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ + --distributed-backend: nccl + --log-interval: 1 + --transformer-impl: inference_optimized + --sequence-parallel: true + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --temperature: 1.0 + --top_k: 1 + --return-log-probs: true + --num-tokens-to-generate: 30 + --inference-dynamic-batching-buffer-size-gb: 20 + --dist-ckpt-strictness: log_unexpected + --inference-ckpt-non-strict: true # To handle the extra_state errors + --output-path: ${INFERENCE_OUTPUT_PATH} + --prompts: "Time travel to 2008, and go to a bar or a club or one of the myriad disco-basements on the Lower East Side that does not quite know which of those it is. Dance awkwardly in a room full of other glittered-up nerds, and wait for something to happen, buoyed on the feeling that this is the big swollen heart of life, that this is New York like the movies." 
+ --incoming-requests-per-step: 32 + --use-flashinfer-fused-rope: true + --inference-logging-step-interval: 1 + --use-coordinator: true +METRICS: + - "generated_tokens" + - "logprobs" diff --git a/tests/test_utils/recipes/h100/gpt-offline-inference.yaml b/tests/test_utils/recipes/h100/gpt-offline-inference.yaml new file mode 100644 index 00000000000..5743dfb080b --- /dev/null +++ b/tests/test_utils/recipes/h100/gpt-offline-inference.yaml @@ -0,0 +1,77 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: '{test_case}_{environment}_{platforms}' + model: gpt + build: mcore-pyt-{environment} + nodes: 1 + gpus: 1 + n_repeat: 1 + platforms: dgx_a100 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + # Checkout backwards-ref + cd /opt + rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy + git init + git remote add origin $MCORE_REPO + git fetch origin $MCORE_BACKWARDS_COMMIT + git checkout $MCORE_BACKWARDS_COMMIT + git rev-parse HEAD + rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ + script: |- + ls + cd /opt/megatron-lm + + ARGUMENTS=( + "CHECKPOINT_LOAD_PATH=/mnt/artifacts" + "CHECKPOINT_SAVE_PATH=/tmp/checkpoints" + "DATA_PATH=/mnt/artifacts/" + "DATA_CACHE_PATH=/workspace/data/cache" + "TRAINING_SCRIPT_PATH=examples/inference/offline_inference.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "INFERENCE_OUTPUT_PATH={assets_dir}/golden_values_{environment}_{platforms}.json" + "N_REPEAT={n_repeat}" + "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}" + "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - test_case: [gpt_offline_inference_sync_tp1_pp1_583m_logitsmatch] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - test_case: [gpt_offline_inference_async_tp1_pp1_583m_logitsmatch] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - test_case: [gpt_offline_inference_sync_tp1_pp1_dp8_583m_logitsmatch_zmq] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] + - test_case: [gpt_offline_inference_async_tp1_pp1_dp8_583m_logitsmatch_zmq] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] From 739f3b0dfc36ef09e275087139ed1a2dfb8bd397 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 19:15:25 -0700 Subject: [PATCH 12/15] test(inference): add HTTP smoke test for launch_inference_server with bespoke driver Co-Authored-By: Claude Opus 4.7 (1M context) --- .../serve_smoke.py | 185 ++++++++++++++++++ .../h100/gpt-inference-server-smoke.yaml | 50 +++++ 2 files changed, 235 insertions(+) create mode 100644 tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py create mode 100644 tests/test_utils/recipes/h100/gpt-inference-server-smoke.yaml diff --git 
a/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py b/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py new file mode 100644 index 00000000000..c030ba51d72 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py @@ -0,0 +1,185 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + +"""Smoke test for ``examples/inference/launch_inference_server.py``. + +Spawns the high-level-API server as a subprocess (TP=1, DP=8 on Mistral 0.5B), +tails its stdout for the readiness banner, sends one OpenAI-compatible +``/v1/completions`` request, and asserts on a 200 response with a non-empty +``choices[0].text``. Server is then SIGTERM'd and joined. + +No golden values: this is a pass/fail HTTP smoke. It validates the daemon-thread +CUDA-device fix, coordinator startup, frontend replicas, and request/response +round-trip end-to-end. +""" + +import argparse +import json +import os +import signal +import subprocess +import sys +import threading +import time +import urllib.error +import urllib.request + +READINESS_MARKER = "Running on http" +READINESS_TIMEOUT_S = 600 +REQUEST_TIMEOUT_S = 60 +SHUTDOWN_TIMEOUT_S = 60 +SERVER_HOST = "0.0.0.0" +SERVER_PORT = 5000 + + +def build_server_cmd(checkpoint_dir: str, tokenizer_model: str) -> list[str]: + """Build the torchrun command for ``launch_inference_server.py`` (Mistral 0.5B, + TP=1 DP=8). Mirrors gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq's + model_config.yaml so the same checkpoint that legacy dp8 inference tests use + is reused here. + """ + return [ + "torchrun", + "--nproc-per-node=8", + "-m", + "examples.inference.launch_inference_server", + "--tiktoken-pattern", "v2", + "--use-mcore-models", + "--tokenizer-type", "TikTokenizer", + "--tokenizer-model", tokenizer_model, + "--auto-detect-ckpt-format", + "--max-tokens-to-oom", "3600000", + "--inference-max-seq-length", "4096", + "--attention-backend", "flash", + "--use-checkpoint-args", + "--micro-batch-size", "1", + "--no-load-optim", + "--no-use-tokenizer-model-from-checkpoint-args", + "--load", checkpoint_dir, + "--distributed-backend", "nccl", + "--transformer-impl", "inference_optimized", + "--sequence-parallel", + "--tensor-model-parallel-size", "1", + "--pipeline-model-parallel-size", "1", + "--deterministic-mode", + "--ckpt-format", "torch_dist", + "--bf16", + "--num-layers", "24", + "--hidden-size", "1152", + "--num-attention-heads", "16", + "--max-position-embeddings", "1024", + "--seq-length", "1024", + "--inference-dynamic-batching-buffer-size-gb", "20", + "--dist-ckpt-strictness", "log_unexpected", + "--inference-ckpt-non-strict", + "--port", str(SERVER_PORT), + "--host", SERVER_HOST, + ] + + +def cleaned_env() -> dict: + """Strip torchrun-specific env vars so the spawned server's torchrun + starts a fresh distributed setup instead of inheriting a stale one. 
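+
+    If RANK / WORLD_SIZE / MASTER_* leak into the child, the nested torchrun
+    (and the workers it spawns) can mistake the parent's rendezvous for its
+    own and fail or hang at init. CUDA_DEVICE_MAX_CONNECTIONS is then re-set
+    explicitly below, since Megatron expects it to be 1.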
+ """ + env = os.environ.copy() + for v in ( + "RANK", "LOCAL_RANK", "WORLD_SIZE", "LOCAL_WORLD_SIZE", + "MASTER_ADDR", "MASTER_PORT", + "TORCHELASTIC_RUN_ID", "TORCHELASTIC_RESTART_COUNT", + "TORCHELASTIC_MAX_RESTARTS", "TORCHELASTIC_USE_AGENT_STORE", + "TORCH_NCCL_ASYNC_ERROR_HANDLING", + ): + env.pop(v, None) + env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + return env + + +def post_completion() -> dict: + body = json.dumps( + { + "model": "EMPTY", + "prompt": "Hello, world!", + "max_tokens": 10, + "temperature": 0.0, + } + ).encode() + req = urllib.request.Request( + f"http://localhost:{SERVER_PORT}/v1/completions", + data=body, + headers={"Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp: + if resp.status != 200: + raise AssertionError(f"server returned status {resp.status}") + return json.loads(resp.read()) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint-dir", required=True) + parser.add_argument("--tokenizer-model", required=True) + args = parser.parse_args() + + cmd = build_server_cmd(args.checkpoint_dir, args.tokenizer_model) + print(f"[smoke] spawning server: {' '.join(cmd)}", flush=True) + + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=cleaned_env(), + ) + + ready = threading.Event() + + def watch(): + for line in proc.stdout: + print(f"[server] {line}", end="", flush=True) + if READINESS_MARKER in line: + ready.set() + + watcher = threading.Thread(target=watch, daemon=True) + watcher.start() + + rc = 1 + try: + if not ready.wait(READINESS_TIMEOUT_S): + print( + f"[smoke] FAIL: readiness banner not seen in {READINESS_TIMEOUT_S}s", + flush=True, + ) + return rc + + # Allow a beat after the readiness banner for all 4 frontend replicas + # to be reachable. 
+ time.sleep(2) + + print("[smoke] sending /v1/completions request", flush=True) + body = post_completion() + choices = body.get("choices") or [] + if not choices: + print(f"[smoke] FAIL: no choices in response: {body}", flush=True) + return rc + text = choices[0].get("text", "") + if not text: + print(f"[smoke] FAIL: empty completion text: {body}", flush=True) + return rc + + print(f"[smoke] PASS: completion={text!r}", flush=True) + rc = 0 + finally: + if proc.poll() is None: + proc.send_signal(signal.SIGTERM) + try: + proc.wait(timeout=SHUTDOWN_TIMEOUT_S) + except subprocess.TimeoutExpired: + print("[smoke] server didn't exit on SIGTERM; SIGKILL", flush=True) + proc.kill() + proc.wait() + return rc + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_utils/recipes/h100/gpt-inference-server-smoke.yaml b/tests/test_utils/recipes/h100/gpt-inference-server-smoke.yaml new file mode 100644 index 00000000000..9059deff50b --- /dev/null +++ b/tests/test_utils/recipes/h100/gpt-inference-server-smoke.yaml @@ -0,0 +1,50 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: '{test_case}_{environment}_{platforms}' + model: gpt + build: mcore-pyt-{environment} + nodes: 1 + gpus: 8 + n_repeat: 1 + platforms: dgx_a100 + script_setup: | + unset https_proxy + echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc + + # Checkout latest + cd /opt + rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm + git init + git remote add origin $MCORE_REPO + git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*' + git fetch origin $MCORE_MR_COMMIT + git checkout $MCORE_MR_COMMIT + git rev-parse HEAD + # Checkout backwards-ref + cd /opt + rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy + git init + git remote add origin $MCORE_REPO + git fetch origin $MCORE_BACKWARDS_COMMIT + git checkout $MCORE_BACKWARDS_COMMIT + git rev-parse HEAD + rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ + script: |- + cd /opt/megatron-lm + + # The driver is plain Python (NOT under torchrun) -- it spawns its own + # torchrun-based server subprocess and then sends an HTTP request. + uv run python \ + tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py \ + --checkpoint-dir /workspace/data/model/mcore_mistral \ + --tokenizer-model /workspace/data/model/mcore_mistral/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + +products: + - test_case: [gpt_inference_server_smoke_tp1_pp1_dp8_583m] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] From 8ff9a266097416993c104fb8e6098cc04e915b64 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 23:03:32 -0700 Subject: [PATCH 13/15] docs(inference): add README for the high-level inference API Co-Authored-By: Claude Opus 4.7 (1M context) --- megatron/inference/README.md | 93 ++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 megatron/inference/README.md diff --git a/megatron/inference/README.md b/megatron/inference/README.md new file mode 100644 index 00000000000..d89f4fff81a --- /dev/null +++ b/megatron/inference/README.md @@ -0,0 +1,93 @@ +# Megatron Inference (High-Level API) + +High-level entry points over the `megatron.core.inference` dynamic +engine. 
Hides `DynamicInferenceContext` + `GPTInferenceWrapper` + +`TextGenerationController` + `DynamicInferenceEngine` construction, +coordinator startup, and the per-instance background asyncio runtime behind +two top-level classes: `MegatronLLM` (sync) and `MegatronAsyncLLM` (async, +with HTTP serving via `serve()`). + +Use this package when you want the typical `llm.generate(prompts, ...)` +ergonomic. Drop down to `megatron.core.inference` directly when you need +manual `add_request` / `step_modern` control or step-level scheduling. + +## Quickstart + +### Offline batch (sync) + +```python +from megatron.inference import MegatronLLM, SamplingParams + +# Caller owns initialize_megatron(...), model construction, and model.eval(). +# See examples/inference/offline_inference.py for a runnable end-to-end script. +with MegatronLLM( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, + use_coordinator=False, +) as llm: + results = llm.generate( + ["Megatron inference is", "Hello, world"], + SamplingParams(num_tokens_to_generate=64), + ) + for r in results: + print(r.generated_text) +``` + +### OpenAI-compatible HTTP server + +```python +import asyncio +from megatron.inference import MegatronAsyncLLM, ServeConfig + +async def main(): + async with MegatronAsyncLLM( + model=model, + tokenizer=tokenizer, + inference_config=inference_config, + use_coordinator=True, # serve() requires coordinator mode + ) as llm: + await llm.serve(ServeConfig(host="0.0.0.0", port=5000)) # blocks until shutdown + +asyncio.run(main()) +``` + +## Mental model + +| Class × `use_coordinator` | Use case | +|---|---| +| `MegatronLLM`, direct (default) | Offline batch on ranks the caller manages (DP sharding owned by user). Blocking. | +| `MegatronLLM`, coordinator | Same offline workload with engine-managed DP routing + `pause`/`suspend`/`resume` lifecycle. | +| `MegatronAsyncLLM`, direct | Same as sync direct but `await`-able. Single-caller in direct mode (concurrent `generate` raises). | +| `MegatronAsyncLLM`, coordinator | Required for `serve()` and for RL-style persistent generators. | + +## Public API + +| Symbol | Purpose | +|---|---| +| `MegatronLLM` | Sync entry. Methods: `generate`, `pause`/`unpause`/`suspend`/`resume`, `shutdown`/`wait_for_shutdown`. Properties: `engine`, `context`, `controller`, `is_primary_rank`. Context-manager protocol. | +| `MegatronAsyncLLM` | Async-flavored equivalent. Adds `serve(serve_config, blocking=True)` for HTTP. | +| `ServeConfig` | Dataclass for the HTTP frontend. Fields: `host` (`"0.0.0.0"`), `port` (`5000`), `parsers` (`[]`), `verbose` (`False`), `frontend_replicas` (`4`). | +| `SamplingParams`, `DynamicInferenceRequest`, `DynamicInferenceRequestRecord` | Re-exports from `megatron.core.inference`. | + +## Caller responsibilities + +- Call `initialize_megatron(...)` (full Megatron distributed setup) BEFORE construction. +- Call `model.eval()` BEFORE construction. The class does not toggle model state. +- Lifecycle methods (`pause`/`unpause`/`suspend`/`resume`) require `use_coordinator=True`; they raise `RuntimeError` in direct mode. + +## Known limitations + +- **`MegatronAsyncLLM.generate()` blocks the caller's event loop in direct mode.** The engine call is sync and inline; it does not yield back while running. Acceptable for offline batched calls; degraded for server/RL workloads that interleave generation with other async work. Tracked for an upstream `engine.async_generate(...)`. 
+ +- **`llm.engine.reset()` is unsafe in coordinator mode.** Two failure modes, both upstream in `dynamic_engine.py`: + - *Deadlock*: `reset()` *rebinds* (does not mutate in-place) `_cond` / `_state_events`. Any coroutine on the engine-loop task that is `await`ing one of those primitives holds a reference to the OLD object in its suspended frame. Subsequent `notify_all()` / `set()` calls hit the NEW objects, leaving the suspended waiter stranded; the next `generate()` hangs. + - *Silent corruption*: `reset()` also sets `self.use_coordinator = False`, which silently re-routes failed-request handling, scheduling notification, and `suspend()`'s state machine to direct-mode branches. Outcome: not-a-hang but wrong behavior, harder to diagnose. + - The example `offline_inference.py` blocks `--inference-repeat-n > 1` with `--use-coordinator` for these reasons. Direct-mode reset is safe. + +- **Async-direct `generate()` is single-caller.** Concurrent `await llm.generate(...)` (e.g. via `asyncio.gather`) in direct mode raises `RuntimeError`. Pass batched prompts instead, or switch to coordinator mode. + +## See also + +- Examples: [`examples/inference/offline_inference.py`](../../examples/inference/offline_inference.py) (4 modes via `--mode` / `--use-coordinator`), [`examples/inference/launch_inference_server.py`](../../examples/inference/launch_inference_server.py) (HTTP server). +- Low-level engine: [`megatron/core/inference/`](../core/inference/). From 9e7fae3292bbe9c48c0c7759cfff23963c2ec96c Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Thu, 7 May 2026 23:08:15 -0700 Subject: [PATCH 14/15] docs(inference): rewrite examples README and remove stale llama_mistral inference sections Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/llama_mistral.md | 16 -- examples/inference/README.md | 373 +++++++++-------------------------- 2 files changed, 91 insertions(+), 298 deletions(-) diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index 2754405610c..9fedc0c9e9a 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -210,14 +210,6 @@ python Megatron-Bridge/examples/conversion/convert_checkpoints.py import \ After this conversion, we are ready to load the checkpoints into a Megatron GPT model. -## (Optional) Validate checkpoints - -A Megatron-LM text generation server for Llama3 can be launched using the script `examples/inference/llama_mistral/run_text_generation_llama3.sh `. For Llama3.1, please use `examples/inference/llama_mistral/run_text_generation_llama3.1.sh`. - -Once running, query the server with `curl 'http://:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":[""], "tokens_to_generate":100, "top_k":1}'`. - -A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/llama_mistral/huggingface_reference.py --model_path --prompt `. - ## Launch model If loading for either inference or finetuning, use the following arguments for Llama 3.0: @@ -314,14 +306,6 @@ python Megatron-Bridge/examples/conversion/convert_checkpoints.py import \ After this conversion, we are ready to load the checkpoints into a Megatron GPT model. -## (Optional) Validate checkpoints - -A Megatron-LM text generation server for Mistral-7B can be launched using the script `examples/inference/llama_mistral/run_text_generation_mistral.sh `. 
- -Once running, query the server with `curl 'http://:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":[""], "tokens_to_generate":100, "top_k":1}'`. - -A reference generation for comparison can be obtained from the Huggingface transformers library by running `python examples/inference/llama_mistral/huggingface_reference.py --model_path --prompt `. - ## Launch model If loading for either inference or finetuning, use the following arguments: diff --git a/examples/inference/README.md b/examples/inference/README.md index 290b07440ab..98a1afbcba6 100644 --- a/examples/inference/README.md +++ b/examples/inference/README.md @@ -1,303 +1,112 @@ ### Megatron Core Inference Documentation This guide provides an example for Megatron Core for running model inference. -### Folder structure -``` -examples/inference - legacy/ - gpt_dynamic_inference_with_coordinator.py - gpt_dynamic_inference.py - gpt_static_inference.py - simple_t5_batch_inference.py - offline_inference.py - launch_inference_server.py - utils.py - run_offline_inference.sh - run_inference_server.sh -``` - ### Contents -- [Megatron Core Inference Documentation](#megatron-core-inference-documentation) -- [Contents](#contents) - - [1. Quick Start](#1-quick-start) - - [1.1 Understanding The Code](#11-understanding-the-code) - - [1.2 Running The Code](#12-running-the-code) - - [2. Flow of Control In MCore Backend](#2-flow-of-control-in-mcore-backend) - - [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline) - - [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend) - - [3.2. Create Your Own Text Generation Controller](#32-create-your-own-text-generation-controller) - - [3.3. Support Other Models](#33-support-other-models) - - [3.3. Modify Inference Parameters](#33-modify-inference-parameters) - - [4. Future work](#4-future-work) - -
- -#### 1. Quickstart -This example runs statically-batched inference on a model trained using Megatron Core. The entrypoint is [gpt_static_inference.py](./gpt/gpt_static_inference.py). A similar workflow can be adapted for [gpt_dynamic_inference.py](./gpt/gpt_dynamic_inference.py). - -
- -##### 1.1 Code Walkthrough -***STEP 1 - Initialize model parallel and other default arguments*** -The micro batch size defaults to 1. It is not used in tensor-parallelism only, and for pipeline-parallel models it is calculated at runtime. -```python -# Initialize Megatron model using the same model provider from training. - initialize_megatron( - args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1} - ) -``` - -***STEP 2 - Load the model using the model_provider_function*** -The model provider function supports both MCore and Legacy models. -```python - # Load the model checkpoint - model = get_model(model_provider, wrap_with_ddp=False) - load_checkpoint(model, None, None) - model.eval() - model = model[0] +- [What's in here](#whats-in-here) +- [Offline inference](#offline-inference) +- [OpenAI-compatible inference server](#openai-compatible-inference-server) +- [Legacy examples](#legacy-examples) +- [See also](#see-also) + +### What's in here + +These examples drive the high-level inference API in `megatron/inference/` +(`MegatronLLM` for sync, `MegatronAsyncLLM` for async + HTTP serving). For +the API surface and mental model see +[`megatron/inference/README.md`](../../megatron/inference/README.md). + +The two top-level Python entrypoints cover all common workflows: + +- **`offline_inference.py`** — batched offline generation. Supports the + 4 mode combinations (sync/async × direct/coordinator) via CLI flags. + Replaces the legacy `gpt_dynamic_inference.py` and + `gpt_dynamic_inference_with_coordinator.py` paths. +- **`launch_inference_server.py`** — OpenAI-compatible HTTP server using + `MegatronAsyncLLM.serve(...)`. Replaces the legacy + `tools/run_dynamic_text_generation_server.py` path. + +`utils.py` holds shared helpers (`Request`, `build_requests`, +`build_dynamic_engine_setup_prefix`, output formatting, JSON dump) used by +both new examples and by the legacy scripts. + +### Offline inference + +`offline_inference.py` runs synthetic-load inference on a Megatron model and +prints a setup-prefix line, a "Unique prompts + outputs" table, and a +throughput summary. Optional JSON dump for regression testing via +`--output-path`. + +The shell wrapper `run_offline_inference.sh` packages the typical Qwen +2.5-1.5B configuration. Required CLI args: `--hf-token`, `--checkpoint`. +Optional: `--mode sync|async` (default `sync`), `--use-coordinator` (default +off, i.e. direct mode), `--nproc ` (default `8`). + +```bash +# sync + direct (defaults) +bash examples/inference/run_offline_inference.sh \ + --hf-token --checkpoint /path/to/qwen-1.5b + +# sync + coordinator +bash examples/inference/run_offline_inference.sh \ + --hf-token --checkpoint /path/to/qwen-1.5b --use-coordinator + +# async + direct +bash examples/inference/run_offline_inference.sh \ + --hf-token --checkpoint /path/to/qwen-1.5b --mode async + +# async + coordinator +bash examples/inference/run_offline_inference.sh \ + --hf-token --checkpoint /path/to/qwen-1.5b --mode async --use-coordinator ``` -***STEP 3 - Choose an engine*** -Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engine/mcore_engine.py) with a [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future. -```python - # Create an inference wrapper to setup the model. 
- inference_wrapped_model = GPTInferenceWrapper(model, args) - - # Define a sampling loop. - text_generation_controller = TextGenerationController( - inference_wrapped_model=inference_wrapped_model, - tokenizer=tokenizer - ) - - # Create a static or dynamic inference engine. - inference_engine = StaticInferenceEngine( - text_generation_controller=text_generation_controller, - max_batch_size=args.max_batch_size -) -``` +All four modes produce numerically identical generated text. The high-level +API rejects `--use-coordinator` with `--inference-repeat-n > 1` (engine +reset is unsafe in coordinator mode — see +[`megatron/inference/README.md`](../../megatron/inference/README.md)). -***STEP 4 - Run text generation*** -The [SamplingParams](../../megatron/core/inference/sampling_params.py) class uses suggested defaults. Customize this to change top_p, top_k, number of tokens to generate, etc. The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py). -```python - results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, sampling_params=sampling_params - ) - - if torch.distributed.get_rank() == 0: - for idx, result in enumerate(results): - print(f' ------------- RESULT FOR PROMPT {idx} --------------- ') - result = { - 'id': result.request_id, - 'input_prompt': result.prompt, - 'generated_text': result.generated_text, - 'generated_tokens' : result.generated_tokens - } - print(result) -``` +### OpenAI-compatible inference server -
+`launch_inference_server.py` uses `MegatronAsyncLLM.serve(blocking=True)` +on a coordinator-backed engine. The HTTP frontend exposes +`/v1/completions` and `/v1/chat/completions` on global rank 0. -##### 1.2 Running The Code -An example Slurm script is shown below. Set the tokenizer paths, inference params, and other settings appropriately. - -For a recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910). +The shell wrapper `run_inference_server.sh` packages the Nemotron-6 3B +hybrid MoE configuration (TP 2, EP 8, PP 1). Required CLI args: +`--hf-token`, `--hf-home`, `--checkpoint`. Optional: `--nproc ` (default +`8`). +```bash +bash examples/inference/run_inference_server.sh \ + --hf-token \ + --hf-home /path/to/hf_home \ + --checkpoint /path/to/nemotron-3b-hybrid-moe ``` -# Slurm cluster settings -ACCOUNT= -MLM_PATH=/path/to/megatron-lm -GPT_CKPT=/path/to/gpt/ckpt -VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file -CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11 - -srun --account $ACCOUNT \ ---job-name=$ACCOUNT:inference \ ---partition=batch \ ---time=01:00:00 \ ---container-image $CONTAINER_IMAGE \ ---container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \ ---no-container-mount-home \ ---pty /bin/bash \ - -# Inside the container run the following. -cd megatron-lm/ -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -TOKENIZER_ARGS=( - --vocab-file /workspace/tokenizer/gpt2-vocab.json - --merge-file /workspace/tokenizer/gpt2-merges.txt - --tokenizer-type GPT2BPETokenizer -) - -MODEL_ARGS=( - --use-checkpoint-args - --use-mcore-models - --load /workspace/mcore_gpt_ckpt -) - -INFERENCE_SPECIFIC_ARGS=( - --attention-dropout 0.0 - --hidden-dropout 0.0 - --num-tokens-to-generate 20 - --max-batch-size 4 -) - -torchrun --nproc-per-node=4 examples/inference/gpt/gpt_static_inference.py \ - ${TOKENIZER_ARGS[@]} \ - ${MODEL_ARGS[@]} \ - ${INFERENCE_SPECIFIC_ARGS[@]} \ - --prompts "prompt one " "sample prompt two" "sample prompt 3" - -NOTE: Other parameters which can be customized for inference: ---temperature (Sampling temperature) ---top_k (top_k sampling) ---top_p (top_p sampling) ---num-tokens-to-generate (Number of tokens to generate for each prompt) ---inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use microbatched pipelining.') ---use-dist-ckpt (If using dist checkpoint format for the model) +When the server is ready you'll see the readiness banner (~2 minutes after +launch on Nemotron-6 3B): ``` - - -
- - -#### 2. Control Flow in the MCore Backend -An example of inference with static batching is provided in [gpt_static_inference.py](./gpt/gpt_static_inference.py). -* [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts. -* The `Scheduler` in the engine will add these prompts to the [active requests] pool (../../megatron/core/inference/inference_request.py) until max batch size is hit. Remaining requests will be added to the waiting requests pool. -* The engine will run until all requests (waiting + active) are completed. - * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller . - * This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop - * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks - * Input tokens and masks are passed it into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits - * Output logits are synchronized across all pipeline parallel ranks - * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters. - * The sampled tokens are then appended to the input prompt tokens for the next iteration - * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition - * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. - * The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool - -
- -#### 3. Customizing The Inference Pipeline - -The inference pipeline supports three levels of customization: - -* **Inference engine** - The MCore Engine supports static and dynamic batching. Modify this to add a new backend. -* **Text generation controller** - The main sampling loop. Customize this to support alternative tokenization or implement a new sampling strategy. -* **Inference Wrapped Model** - Change this to support a new model. -* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, and other sampling parameters. - -
- -##### 3.1. Create Your Own Inference Backend -The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend. - -```python -class AbstractEngine(ABC): - @staticmethod - def generate(self) -> dict: - """The abstract backend's generate function. - - To define a new backend, implement this method and return the outputs as a dictionary. +INFO:root:Inference co-ordinator is ready to receive requests! +INFO:hypercorn.error:Running on http://0.0.0.0:5000 (CTRL + C to quit) ``` -
- -##### 3.2. Implement a new Sampling Loop - -The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies. - -``` python -class TextGenerationController: - - def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: - """Utility to tokenize the input prompts""" +Send requests with any OpenAI-compatible client. The dynamic server +currently returns `"model": "EMPTY"` and does not validate the request +`model` field — pass anything you like. - def sample_from_logits( - self, - last_token_logits: torch.Tensor, - sampling_params: SamplingParams, - vocab_size: int, - generation_started : Optional[torch.Tensor] = None, - top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None, - ) -> torch.Tensor: - """Samples the logits to generate outputs - - Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. If sampling_params.top_n_logprobs > 0 - at each step it also updates the top_n_logprobs_dict. - """ - - def update_generation_status( - self, - updated_prompts_tokens: torch.Tensor, - generation_started: torch.Tensor, - current_context_end_position: int, - is_generation_done_tensor: torch.Tensor, - generated_sequence_lengths: torch.Tensor, - ) -> torch.Tensor: - """Function to check which prompts have reached an end condition - - We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True . The generated sequence lengths increases as we keep generating, until that prompts hits an eod condition. The generation started status tensor helps us determine which prompts have started generating - """ - - def generate_all_output_tokens_static_batch( - self, active_requests: OrderedDict[int, InferenceRequest], - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate all the output tokens and probabilities for the prompts . - - This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests - """ - - def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: - """Detokenize the output generations""" -``` +### Legacy examples -
- -##### 3.3. Support Other Models -Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements: -* Forward method which calls the model `forward` method depending on model parallel settings -* Initializes the model and puts it in `.eval()` mode -* Setup for the input parameters (max batch size, max seq length) - -The following methods should be implemented: -```python -class AbstractModelInferenceWrapper: - def prep_model_for_inference(self, prompts_tokens: torch.Tensor): - """A utility function for preparing model for inference - - The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass - """ - - @abc.abstractclassmethod - def get_batch_for_context_window(self) -> List: - """Returns the input data for inference - - This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. -``` - -Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel. - -
- -##### 3.3. Modify Inference Parameters -We use [common inference params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this to change `top_p`, `top_k`, number of tokens to generate etc. Other attributes can be added for the inference loop as shown below. - -``` -from megatron.core.inference.sampling_params import SamplingParams - -c = SamplingParams(temperature=0.5) -c.add_attributes({'min_length':4, 'eod_id':153}) -``` +`legacy/` preserves the prior `gpt_dynamic_inference.py`, +`gpt_dynamic_inference_with_coordinator.py`, `gpt_static_inference.py`, and +`simple_t5_batch_inference.py` scripts as-is. They are still wired into +the existing recipes under `tests/test_utils/recipes/h100/{gpt,moe,mamba}-*-inference.yaml` +for backward compatibility. New work should target `offline_inference.py` +and `launch_inference_server.py`. -
+### See also -#### 4. Future work -The following features are planned for future releases. -* TRTLLM Engine support -* Continuous batching optimizations -* Speculative decoding \ No newline at end of file +- API reference: [`megatron/inference/README.md`](../../megatron/inference/README.md) +- Low-level engine: [`megatron/core/inference/`](../../megatron/core/inference/) +- Functional tests: `tests/functional_tests/test_cases/gpt/gpt_offline_inference_*` + `gpt_inference_server_smoke_*` +- Unit tests: `tests/unit_tests/inference/high_level_api/` From 34494c58e48cbc7f1534ba90ee08ee9724934508 Mon Sep 17 00:00:00 2001 From: YangFei1990 Date: Fri, 8 May 2026 11:48:38 -0700 Subject: [PATCH 15/15] ci(inference): satisfy linting, copyright-check, and build-docs Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/llama_mistral.md | 2 - .../serve_smoke.py | 92 +++++++++++-------- .../inference/high_level_api/__init__.py | 1 + .../test_async_llm_serve_guard.py | 4 +- .../high_level_api/test_event_loop_manager.py | 1 + .../high_level_api/test_lifecycle_guards.py | 8 +- .../high_level_api/test_normalize_prompts.py | 8 +- .../high_level_api/test_validation.py | 20 +--- 8 files changed, 69 insertions(+), 67 deletions(-) diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md index 9fedc0c9e9a..6f084084e81 100644 --- a/docs/llama_mistral.md +++ b/docs/llama_mistral.md @@ -37,12 +37,10 @@ Architecturally Llama-2, Llama-3 and Mistral-7b are very similar. As such Megatr - [Download Huggingface checkpoints](#download-huggingface-checkpoints) - [Convert checkpoint format](#convert-checkpoint-format) - [Huggingface format](#huggingface-format) - - [(Optional) Validate checkpoints](#optional-validate-checkpoints) - [Launch model](#launch-model) - [Mistral-7b](#mistral-7b) - [Download Huggingface checkpoints](#download-huggingface-checkpoints) - [Convert checkpoint format](#convert-checkpoint-format) - - [(Optional) Validate checkpoints](#optional-validate-checkpoints) - [Launch model](#launch-model) - [Other Llama-like model support](#other-llama-like-model-support) - [Known numerical differences](#known-numerical-differences) diff --git a/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py b/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py index c030ba51d72..f3029e26bb7 100644 --- a/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py +++ b/tests/functional_tests/test_cases/gpt/gpt_inference_server_smoke_tp1_pp1_dp8_583m/serve_smoke.py @@ -42,37 +42,59 @@ def build_server_cmd(checkpoint_dir: str, tokenizer_model: str) -> list[str]: "--nproc-per-node=8", "-m", "examples.inference.launch_inference_server", - "--tiktoken-pattern", "v2", + "--tiktoken-pattern", + "v2", "--use-mcore-models", - "--tokenizer-type", "TikTokenizer", - "--tokenizer-model", tokenizer_model, + "--tokenizer-type", + "TikTokenizer", + "--tokenizer-model", + tokenizer_model, "--auto-detect-ckpt-format", - "--max-tokens-to-oom", "3600000", - "--inference-max-seq-length", "4096", - "--attention-backend", "flash", + "--max-tokens-to-oom", + "3600000", + "--inference-max-seq-length", + "4096", + "--attention-backend", + "flash", "--use-checkpoint-args", - "--micro-batch-size", "1", + "--micro-batch-size", + "1", "--no-load-optim", "--no-use-tokenizer-model-from-checkpoint-args", - "--load", checkpoint_dir, - "--distributed-backend", "nccl", - "--transformer-impl", "inference_optimized", + "--load", + 
checkpoint_dir, + "--distributed-backend", + "nccl", + "--transformer-impl", + "inference_optimized", "--sequence-parallel", - "--tensor-model-parallel-size", "1", - "--pipeline-model-parallel-size", "1", + "--tensor-model-parallel-size", + "1", + "--pipeline-model-parallel-size", + "1", "--deterministic-mode", - "--ckpt-format", "torch_dist", + "--ckpt-format", + "torch_dist", "--bf16", - "--num-layers", "24", - "--hidden-size", "1152", - "--num-attention-heads", "16", - "--max-position-embeddings", "1024", - "--seq-length", "1024", - "--inference-dynamic-batching-buffer-size-gb", "20", - "--dist-ckpt-strictness", "log_unexpected", + "--num-layers", + "24", + "--hidden-size", + "1152", + "--num-attention-heads", + "16", + "--max-position-embeddings", + "1024", + "--seq-length", + "1024", + "--inference-dynamic-batching-buffer-size-gb", + "20", + "--dist-ckpt-strictness", + "log_unexpected", "--inference-ckpt-non-strict", - "--port", str(SERVER_PORT), - "--host", SERVER_HOST, + "--port", + str(SERVER_PORT), + "--host", + SERVER_HOST, ] @@ -82,10 +104,16 @@ def cleaned_env() -> dict: """ env = os.environ.copy() for v in ( - "RANK", "LOCAL_RANK", "WORLD_SIZE", "LOCAL_WORLD_SIZE", - "MASTER_ADDR", "MASTER_PORT", - "TORCHELASTIC_RUN_ID", "TORCHELASTIC_RESTART_COUNT", - "TORCHELASTIC_MAX_RESTARTS", "TORCHELASTIC_USE_AGENT_STORE", + "RANK", + "LOCAL_RANK", + "WORLD_SIZE", + "LOCAL_WORLD_SIZE", + "MASTER_ADDR", + "MASTER_PORT", + "TORCHELASTIC_RUN_ID", + "TORCHELASTIC_RESTART_COUNT", + "TORCHELASTIC_MAX_RESTARTS", + "TORCHELASTIC_USE_AGENT_STORE", "TORCH_NCCL_ASYNC_ERROR_HANDLING", ): env.pop(v, None) @@ -95,12 +123,7 @@ def cleaned_env() -> dict: def post_completion() -> dict: body = json.dumps( - { - "model": "EMPTY", - "prompt": "Hello, world!", - "max_tokens": 10, - "temperature": 0.0, - } + {"model": "EMPTY", "prompt": "Hello, world!", "max_tokens": 10, "temperature": 0.0} ).encode() req = urllib.request.Request( f"http://localhost:{SERVER_PORT}/v1/completions", @@ -146,10 +169,7 @@ def watch(): rc = 1 try: if not ready.wait(READINESS_TIMEOUT_S): - print( - f"[smoke] FAIL: readiness banner not seen in {READINESS_TIMEOUT_S}s", - flush=True, - ) + print(f"[smoke] FAIL: readiness banner not seen in {READINESS_TIMEOUT_S}s", flush=True) return rc # Allow a beat after the readiness banner for all 4 frontend replicas diff --git a/tests/unit_tests/inference/high_level_api/__init__.py b/tests/unit_tests/inference/high_level_api/__init__.py index e69de29bb2d..b5dff7b5663 100644 --- a/tests/unit_tests/inference/high_level_api/__init__.py +++ b/tests/unit_tests/inference/high_level_api/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
diff --git a/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py b/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py index 341a2465736..2141da9e73a 100644 --- a/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py +++ b/tests/unit_tests/inference/high_level_api/test_async_llm_serve_guard.py @@ -28,9 +28,7 @@ def fake_model_and_tokenizer(): class TestAsyncLLMServeGuard: @pytest.mark.asyncio - async def test_serve_requires_use_coordinator( - self, mock_pipeline, fake_model_and_tokenizer - ): + async def test_serve_requires_use_coordinator(self, mock_pipeline, fake_model_and_tokenizer): model, tok = fake_model_and_tokenizer llm = MegatronAsyncLLM(model=model, tokenizer=tok) # direct mode with pytest.raises(ValueError, match="requires use_coordinator=True"): diff --git a/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py b/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py index 73aecaeeda6..27a4c89df52 100644 --- a/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py +++ b/tests/unit_tests/inference/high_level_api/test_event_loop_manager.py @@ -29,6 +29,7 @@ def test_submit_run_sync_run_async(self): mgr = _EventLoopManager() mgr.start() try: + async def coro(): return 42 diff --git a/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py b/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py index 76dffbc3968..573b4a82285 100644 --- a/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py +++ b/tests/unit_tests/inference/high_level_api/test_lifecycle_guards.py @@ -47,9 +47,7 @@ def _make_worker_instance(cls): class TestDirectModeLifecycleGuards: """Direct mode: pause/unpause/suspend/resume must raise; shutdown is a no-op.""" - def test_sync_lifecycle_raises_in_direct_mode( - self, mock_pipeline, fake_model_and_tokenizer - ): + def test_sync_lifecycle_raises_in_direct_mode(self, mock_pipeline, fake_model_and_tokenizer): model, tok = fake_model_and_tokenizer llm = MegatronLLM(model=model, tokenizer=tok) for method in ("pause", "unpause", "suspend", "resume"): @@ -89,9 +87,7 @@ async def test_async_generate_raises_on_worker_rank(self): class TestShutdownIdempotence: - def test_sync_shutdown_idempotent_in_direct_mode( - self, mock_pipeline, fake_model_and_tokenizer - ): + def test_sync_shutdown_idempotent_in_direct_mode(self, mock_pipeline, fake_model_and_tokenizer): model, tok = fake_model_and_tokenizer llm = MegatronLLM(model=model, tokenizer=tok) llm.shutdown() diff --git a/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py b/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py index 95de353d3d2..1f2ac13b34d 100644 --- a/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py +++ b/tests/unit_tests/inference/high_level_api/test_normalize_prompts.py @@ -31,10 +31,10 @@ def test_empty_list_is_batch(self): @pytest.mark.parametrize( "bad_input", [ - {1, 2}, # set - 1.5, # float - [1.5], # list of floats (first elem is float) - {"k": "v"}, # dict + {1, 2}, # set + 1.5, # float + [1.5], # list of floats (first elem is float) + {"k": "v"}, # dict ], ) def test_unsupported_inputs_raise_typeerror(self, bad_input): diff --git a/tests/unit_tests/inference/high_level_api/test_validation.py b/tests/unit_tests/inference/high_level_api/test_validation.py index fb7d260b8f1..0f2243121e2 100644 --- a/tests/unit_tests/inference/high_level_api/test_validation.py +++ 
b/tests/unit_tests/inference/high_level_api/test_validation.py @@ -33,28 +33,16 @@ def test_coordinator_host_without_use_coordinator_raises( ): model, tok = fake_model_and_tokenizer with pytest.raises(ValueError, match="coordinator_host/port require use_coordinator=True"): - cls( - model=model, - tokenizer=tok, - use_coordinator=False, - coordinator_host="x", - ) + cls(model=model, tokenizer=tok, use_coordinator=False, coordinator_host="x") def test_coordinator_port_without_use_coordinator_raises( self, cls, mock_pipeline, fake_model_and_tokenizer ): model, tok = fake_model_and_tokenizer with pytest.raises(ValueError, match="coordinator_host/port require use_coordinator=True"): - cls( - model=model, - tokenizer=tok, - use_coordinator=False, - coordinator_port=5000, - ) - - def test_direct_mode_constructor_succeeds( - self, cls, mock_pipeline, fake_model_and_tokenizer - ): + cls(model=model, tokenizer=tok, use_coordinator=False, coordinator_port=5000) + + def test_direct_mode_constructor_succeeds(self, cls, mock_pipeline, fake_model_and_tokenizer): model, tok = fake_model_and_tokenizer llm = cls(model=model, tokenizer=tok) assert llm.is_primary_rank is True