diff --git a/.editorconfig b/.editorconfig index 7c1af01a1d8..5663b8fdbfc 100644 --- a/.editorconfig +++ b/.editorconfig @@ -45,7 +45,7 @@ insert_final_newline = unset trim_trailing_whitespace = unset insert_final_newline = unset -[tools/server/webui/**] +[tools/ui/**] indent_style = unset indent_size = unset end_of_line = unset diff --git a/.github/labeler.yml b/.github/labeler.yml index 2120672eee5..60aa51d2cc2 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -73,10 +73,10 @@ android: - changed-files: - any-glob-to-any-file: - examples/llama.android/** -server/webui: +server/ui: - changed-files: - any-glob-to-any-file: - - tools/server/webui/** + - tools/ui/** server: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml index deed8e808b7..ef3fe502fa7 100644 --- a/.github/workflows/build-and-test-snapdragon.yml +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -58,14 +58,45 @@ jobs: name: llama-cpp-android-arm64-snapdragon path: pkg-snapdragon/llama.cpp + linux-iot-snapdragon: + runs-on: ubuntu-latest + container: + image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1' + defaults: + run: + shell: bash + + steps: + - name: Clone + uses: actions/checkout@v6 + with: + fetch-depth: 0 + lfs: false + + - name: Build Llama.CPP for Snapdragon Linux IoT + id: build_llama_cpp_snapdragon_linux + run: | + cp docs/backend/snapdragon/CMakeUserPresets.json . 
+ cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON + cmake --build build-snapdragon -j $(nproc) + cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp + + - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact + if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }} + uses: actions/upload-artifact@v6 + with: + name: llama-cpp-linux-arm64-snapdragon + path: pkg-snapdragon/llama.cpp + test-snapdragon-qdc: - name: Test on QDC Android Device (${{ matrix.device }}) - needs: [android-ndk-snapdragon] - runs-on: ubuntu-slim + name: Test on QDC Device (${{ matrix.device }}) + needs: [android-ndk-snapdragon, linux-iot-snapdragon] + runs-on: ubuntu-24.04-arm + timeout-minutes: 90 strategy: fail-fast: false matrix: - device: [SM8750, SM8650, SM8850] + device: [SM8750, SM8850, QCS9075M] steps: - name: Checkout @@ -74,11 +105,11 @@ jobs: - name: Download build artifact uses: actions/download-artifact@v7 with: - name: llama-cpp-android-arm64-snapdragon + name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }} path: pkg-snapdragon/llama.cpp - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' cache: pip @@ -107,7 +138,8 @@ jobs: --test all \ --pkg-dir pkg-snapdragon/llama.cpp \ --model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \ - --device ${{ matrix.device }} + --device ${{ matrix.device }} \ + ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }} env: QDC_API_KEY: ${{ secrets.QDC_API_KEY }} diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index ee8fd41d7c7..2851c45601f 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -68,6 +68,8 @@ jobs: - name: Determine tag name id: tag uses: 
./.github/actions/get-tag-name + env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} ggml-ci-nvidia-cuda: needs: determine-tag @@ -81,7 +83,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | nvidia-smi GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp @@ -98,7 +100,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp @@ -115,7 +117,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp @@ -205,7 +207,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -234,7 +236,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -251,7 +253,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -270,7 +272,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ 
needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -291,7 +293,7 @@ jobs: MSYSTEM: UCRT64 CHERE_INVOKING: 1 PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }} - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create @@ -332,7 +334,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | source ./openvino_toolkit/setupvars.sh GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8b01798ae78..b438bf2196c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -36,13 +36,13 @@ env: CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml + ui-build: + name: Build UI + uses: ./.github/workflows/ui-build.yml macOS-cpu: needs: - - webui-build + - ui-build strategy: matrix: @@ -71,11 +71,11 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -114,7 +114,7 @@ jobs: ubuntu-cpu: needs: - - webui-build + - ui-build strategy: matrix: @@ -135,11 +135,11 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: -
name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache if: ${{ matrix.build != 's390x' }} @@ -192,7 +192,7 @@ jobs: ubuntu-vulkan: needs: - - webui-build + - ui-build strategy: matrix: @@ -211,11 +211,11 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -269,7 +269,7 @@ jobs: android-arm64: needs: - - webui-build + - ui-build runs-on: ubuntu-latest @@ -283,11 +283,11 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -347,7 +347,7 @@ jobs: ubuntu-24-openvino: needs: - - webui-build + - ui-build runs-on: ubuntu-24.04 @@ -370,11 +370,11 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -436,7 +436,7 @@ jobs: windows-cpu: needs: - - webui-build + - ui-build runs-on: windows-2025 @@ -452,11 +452,11 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -497,7 +497,7 @@ jobs: windows: needs: - - webui-build + - ui-build runs-on: windows-2025 @@ -522,11 +522,11 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact + - name: Download UI build 
artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -588,7 +588,7 @@ jobs: windows-cuda: needs: - - webui-build + - ui-build runs-on: windows-2022 @@ -601,11 +601,11 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Install ccache uses: ggml-org/ccache-action@v1.2.21 @@ -668,7 +668,7 @@ jobs: windows-sycl: needs: - - webui-build + - ui-build runs-on: windows-2022 @@ -708,11 +708,11 @@ jobs: Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -782,7 +782,7 @@ jobs: ubuntu-24-sycl: needs: - - webui-build + - ui-build strategy: matrix: @@ -831,11 +831,11 @@ jobs: wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -877,7 +877,7 @@ jobs: ubuntu-22-rocm: needs: - - webui-build + - ui-build runs-on: ubuntu-22.04 @@ -895,11 +895,11 @@ jobs: with: fetch-depth: 0 - - name: 
Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Free up disk space uses: ggml-org/free-disk-space@v1.3.1 @@ -989,7 +989,7 @@ jobs: windows-hip: needs: - - webui-build + - ui-build runs-on: windows-2022 @@ -1007,11 +1007,11 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Grab rocWMMA package id: grab_rocwmma @@ -1259,7 +1259,7 @@ jobs: runs-on: ubuntu-slim needs: - - webui-build + - ui-build - windows - windows-cpu - windows-cuda @@ -1404,14 +1404,14 @@ jobs: } } - webui-publish: + ui-publish: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} needs: - release - uses: ./.github/workflows/webui-publish.yml + uses: ./.github/workflows/ui-publish.yml with: version_tag: ${{ needs.release.outputs.tag_name }} secrets: - hf_token: ${{ secrets.HF_TOKEN_WEBUI_STATIC_OUTPUT }} + hf_token: ${{ secrets.HF_TOKEN_UI_STATIC_OUTPUT }} diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 117c79c2f87..15a0b7df87c 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -39,12 +39,12 @@ concurrency: cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml + ui-build: + name: Build UI + uses: ./.github/workflows/ui-build.yml server-metal: - needs: webui-build + needs: ui-build runs-on: [self-hosted, llama-server, macOS, ARM64] name: server-metal (${{ matrix.wf_name }}) @@ -72,11 +72,11 @@ jobs: with: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha
|| github.head_ref || github.ref_name }} - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Build id: cmake_build diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-ui.yml similarity index 75% rename from .github/workflows/server-webui.yml rename to .github/workflows/server-ui.yml index 72c2016efe7..8ab61e96cda 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-ui.yml @@ -1,4 +1,4 @@ -name: Server WebUI +name: Server UI on: workflow_dispatch: @@ -11,15 +11,15 @@ on: branches: - master paths: [ - '.github/workflows/server-webui.yml', - 'tools/server/webui/**.*', + '.github/workflows/server-ui.yml', + 'tools/ui/**.*', 'tools/server/tests/**.*' ] pull_request: types: [opened, synchronize, reopened] paths: [ - '.github/workflows/server-webui.yml', - 'tools/server/webui/**.*', + '.github/workflows/server-ui.yml', + 'tools/ui/**.*', 'tools/server/tests/**.*' ] @@ -34,13 +34,13 @@ concurrency: cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml + ui-build: + name: Build UI + uses: ./.github/workflows/ui-build.yml - webui-checks: - name: WebUI Checks - needs: webui-build + ui-checks: + name: UI Checks + needs: ui-build runs-on: ubuntu-24.04-arm continue-on-error: true steps: @@ -56,43 +56,43 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Run type checking if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run check - working-directory: tools/server/webui + working-directory: tools/ui - name: Run 
linting if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run lint - working-directory: tools/server/webui + working-directory: tools/ui - name: Install Playwright browsers id: playwright if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps - working-directory: tools/server/webui + working-directory: tools/ui - name: Run Client tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:client - working-directory: tools/server/webui + working-directory: tools/ui - name: Run Unit tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:unit - working-directory: tools/server/webui + working-directory: tools/ui e2e-tests: name: E2E Tests - needs: webui-build + needs: ui-build runs-on: ubuntu-24.04-arm steps: - name: Checkout code @@ -107,36 +107,36 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Build application if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run build - working-directory: tools/server/webui + working-directory: tools/ui - name: Install Playwright browsers id: playwright if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps - working-directory: tools/server/webui + working-directory: tools/ui - name: Build Storybook if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run build-storybook - working-directory: tools/server/webui + working-directory: tools/ui - name: Run UI tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:ui -- --testTimeout=60000 - working-directory: tools/server/webui + working-directory: tools/ui - name: 
Run E2E tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e - working-directory: tools/server/webui + working-directory: tools/ui diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index f12e7b7366f..a5f8fef5983 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -54,12 +54,12 @@ concurrency: cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml + ui-build: + name: Build UI + uses: ./.github/workflows/ui-build.yml server: - needs: webui-build + needs: ui-build runs-on: ubuntu-latest name: server (${{ matrix.wf_name }}) @@ -98,11 +98,11 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Build id: cmake_build @@ -136,7 +136,7 @@ jobs: SLOW_TESTS=1 pytest -v -x server-windows: - needs: webui-build + needs: ui-build runs-on: windows-2022 steps: @@ -147,11 +147,11 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Build id: cmake_build diff --git a/.github/workflows/webui-build.yml b/.github/workflows/ui-build.yml similarity index 67% rename from .github/workflows/webui-build.yml rename to .github/workflows/ui-build.yml index 7f512a69d3d..fd60e1d36dc 100644 --- a/.github/workflows/webui-build.yml +++ b/.github/workflows/ui-build.yml @@ -1,11 +1,11 @@ -name: Build WebUI +name: Build UI on: workflow_call: jobs: 
build: - name: Build WebUI + name: Build UI runs-on: ubuntu-slim env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -19,26 +19,26 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Build application run: npm run build - working-directory: tools/server/webui + working-directory: tools/ui - name: Generate checksums run: | - cd tools/server/public + cd build/tools/ui for f in *; do sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt done - - name: Upload built webui + - name: Upload built UI uses: actions/upload-artifact@v6 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ retention-days: 1 diff --git a/.github/workflows/webui-publish.yml b/.github/workflows/ui-publish.yml similarity index 78% rename from .github/workflows/webui-publish.yml rename to .github/workflows/ui-publish.yml index bf0d707a23e..42ca5c6b3a4 100644 --- a/.github/workflows/webui-publish.yml +++ b/.github/workflows/ui-publish.yml @@ -1,4 +1,4 @@ -name: WebUI Publish +name: UI Publish on: workflow_call: @@ -14,14 +14,14 @@ on: jobs: publish: - name: Publish WebUI Static Output + name: Publish UI Static Output runs-on: ubuntu-24.04-arm permissions: contents: read env: - HF_BUCKET_NAME: ${{ vars.HF_BUCKET_WEBUI_STATIC_OUTPUT }} + HF_BUCKET_NAME: ${{ vars.HF_BUCKET_UI_STATIC_OUTPUT }} steps: - name: Checkout code @@ -29,11 +29,11 @@ jobs: with: fetch-depth: 1 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/ - name: Install Hugging Face Hub CLI run: pip install -U huggingface_hub @@ -44,12 +44,12 @@ jobs: - name: Sync built files to Hugging Face bucket 
(version tag) run: | # Upload the built files to the Hugging Face bucket under the release version - hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet + hf buckets sync build/tools/ui hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet - name: Sync built files to Hugging Face bucket (latest) run: | # Also upload to the 'latest' directory for fallback downloads - hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet + hf buckets sync build/tools/ui hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet - name: Verify upload run: | diff --git a/.gitignore b/.gitignore index 5f53fbf7a61..4af901fe926 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,6 @@ /tmp/ /autogen-*.md /common/build-info.cpp -/tools/server/public # Deprecated diff --git a/CMakeLists.txt b/CMakeLists.txt index 244f4cb4982..44746072313 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,8 +108,23 @@ option(LLAMA_BUILD_TESTS "llama: build tests" option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON) -option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket when available (requires LLAMA_BUILD_WEBUI=ON)" ON) +# Deprecated: use LLAMA_BUILD_UI instead (kept for backward compat) +option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server (deprecated: use LLAMA_BUILD_UI)" ON) +option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket when available (deprecated: use LLAMA_USE_PREBUILT_UI)" ON) + +# New option names +option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON) +option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt 
UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON) + +# Backward compat: the option() calls above already define both variables, so a DEFINED check would always pass; forward the old flag only when it was switched OFF while the new one is still ON +if(NOT LLAMA_BUILD_WEBUI AND LLAMA_BUILD_UI) + set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI}) + message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead") +endif() +if(NOT LLAMA_USE_PREBUILT_WEBUI AND LLAMA_USE_PREBUILT_UI) + set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI}) + message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead") +endif() option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) option(LLAMA_TESTS_INSTALL "llama: install tests" ON) diff --git a/CODEOWNERS b/CODEOWNERS index a4395969fe3..f58f0f830fa 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -15,7 +15,7 @@ # ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin # ggml-org/llama-mtmd : ngxson # ggml-org/llama-server : ggerganov, ngxson, allozaur, angt, ServeurpersoCom -# ggml-org/llama-webui : allozaur +# ggml-org/llama-ui : allozaur /.devops/*.Dockerfile @ngxson /.github/actions/ @ggml-org/ci @@ -107,7 +107,7 @@ /tools/rpc/ @ggml-org/ggml-rpc /tools/server/* @ggml-org/llama-server # no subdir /tools/server/tests/ @ggml-org/llama-server -/tools/server/webui/ @ggml-org/llama-webui +/tools/ui/ @ggml-org/llama-ui /tools/tokenize/ @ggerganov /tools/tts/ @ggerganov /vendor/ @ggerganov diff --git a/README.md b/README.md index de93f17eb77..a0c14b9d7f0 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) +- Ruby: 
[docusealco/rllama](https://github.com/docusealco/rllama) - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) diff --git a/common/arg.cpp b/common/arg.cpp index 15d5ad77af1..2129a9c7266 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2844,28 +2844,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); + // Deprecated: use --ui-config instead (kept for backward compat) add_opt(common_arg( {"--webui-config"}, "JSON", - "JSON that provides default WebUI settings (overrides WebUI defaults)", + "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)", [](common_params & params, const std::string & value) { + params.ui_config_json = value; params.webui_config_json = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); + + add_opt(common_arg( + {"--ui-config"}, "JSON", + "JSON that provides default UI settings (overrides UI defaults)", + [](common_params & params, const std::string & value) { + params.ui_config_json = value; + params.webui_config_json = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG")); + + // Deprecated: use --ui-config-file instead (kept for backward compat) add_opt(common_arg( {"--webui-config-file"}, "PATH", - "JSON file that provides default WebUI settings (overrides WebUI defaults)", + "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)", [](common_params & params, const std::string & value) { - params.webui_config_json = read_file(value); + params.ui_config_json = read_file(value); + params.webui_config_json = 
params.ui_config_json; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); + + add_opt(common_arg( + {"--ui-config-file"}, "PATH", + "JSON file that provides default UI settings (overrides UI defaults)", + [](common_params & params, const std::string & value) { + params.ui_config_json = read_file(value); + params.webui_config_json = params.ui_config_json; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE")); + + // Deprecated: use --ui-mcp-proxy instead (kept for backward compat) add_opt(common_arg( {"--webui-mcp-proxy"}, {"--no-webui-mcp-proxy"}, - string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"), + "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy", [](common_params & params, bool value) { + params.ui_mcp_proxy = value; params.webui_mcp_proxy = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY")); + + add_opt(common_arg( + {"--ui-mcp-proxy"}, + {"--no-ui-mcp-proxy"}, + "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)", + [](common_params & params, bool value) { + params.ui_mcp_proxy = value; + params.webui_mcp_proxy = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY")); add_opt(common_arg( {"--tools"}, "TOOL1,TOOL2,...", "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n" @@ -2875,14 +2911,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.server_tools = parse_csv_row(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS")); + // Deprecated: use --ui/--no-ui instead (kept for backward compat) add_opt(common_arg( {"--webui"}, {"--no-webui"}, - string_format("whether to enable 
the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI", [](common_params & params, bool value) { + params.ui = value; params.webui = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); + + add_opt(common_arg( + {"--ui"}, + {"--no-ui"}, + string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.ui = value; + params.webui = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), diff --git a/common/common.h b/common/common.h index 764574e23a0..c6223c4b515 100644 --- a/common/common.h +++ b/common/common.h @@ -604,15 +604,23 @@ struct common_params { std::map default_template_kwargs; - // webui configs -#ifdef LLAMA_WEBUI_DEFAULT_ENABLED - bool webui = LLAMA_WEBUI_DEFAULT_ENABLED != 0; + // UI configs +#ifdef LLAMA_UI_DEFAULT_ENABLED + bool ui = LLAMA_UI_DEFAULT_ENABLED != 0; +#elif defined(LLAMA_WEBUI_DEFAULT_ENABLED) + bool ui = LLAMA_WEBUI_DEFAULT_ENABLED != 0; #else - bool webui = true; // default to enabled when not set + bool ui = true; // default to enabled when not set #endif + + // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead + bool webui = ui; bool webui_mcp_proxy = false; std::string webui_config_json; + bool ui_mcp_proxy = false; + std::string ui_config_json; + // "advanced" endpoints are disabled by default for better security bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index c6e1f86c91e..958c9cacf51 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -171,22 +171,12 @@ static 
void common_reasoning_budget_reset(struct llama_sampler * smpl) { ctx->force_pos = 0; } -// forward declaration for use in clone static struct llama_sampler * common_reasoning_budget_init_state( const struct llama_vocab * vocab, const std::vector & start_tokens, const std::vector & end_tokens, const std::vector & forced_tokens, int32_t budget, common_reasoning_budget_state initial_state); -static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; - return common_reasoning_budget_init_state( - ctx->vocab, - ctx->start_matcher.tokens, - ctx->end_matcher.tokens, - ctx->forced_tokens, - ctx->budget, - ctx->state); -} +static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl); static void common_reasoning_budget_free(struct llama_sampler * smpl) { delete (common_reasoning_budget_ctx *) smpl->ctx; @@ -205,6 +195,15 @@ static struct llama_sampler_i common_reasoning_budget_i = { /* .backend_set_input = */ nullptr, }; +static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; + + return llama_sampler_init( + /* .iface = */ &common_reasoning_budget_i, + /* .ctx = */ new common_reasoning_budget_ctx(*ctx) + ); +} + static struct llama_sampler * common_reasoning_budget_init_state( const struct llama_vocab * vocab, const std::vector & start_tokens, diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index a2948273512..e833070eee9 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -44,6 +44,7 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f GRADER_PATTERNS = { "aime": r'\boxed{(\d+)}|\b(\d+)\b', "aime2025": r'\boxed{(\d+)}|\b(\d+)\b', + "aime2026": r'\boxed{(\d+)}|\b(\d+)\b', "gsm8k": r'\b(\d+)\b', } @@ -58,6 +59,11 @@ def 
wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f "-123", "999" ], + "aime2026": [ + "42", + "-123", + "999" + ], "gsm8k": [ "42", "-123", @@ -81,6 +87,12 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f {question} +Remember to put your answer inside \\boxed{{}}. +""", + "aime2026": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}. + +{question} + Remember to put your answer inside \\boxed{{}}. """, "gsm8k": """{question} @@ -166,6 +178,8 @@ def load_dataset(self, seed: int = 1234): self.dataset = AimeDataset() elif self.dataset_type == "aime2025": self.dataset = Aime2025Dataset() + elif self.dataset_type == "aime2026": + self.dataset = Aime2026Dataset() elif self.dataset_type == "gsm8k": self.dataset = Gsm8kDataset() elif self.dataset_type == "gpqa": @@ -679,6 +693,47 @@ def get_prompt(self, question: Dict) -> str: question=self.get_question_text(question), ) +class Aime2026Dataset(BaseDataset): + def __init__(self): + self.questions = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME2026 dataset...") + from datasets import load_dataset + + cache_path = cache_dir / "MathArena___aime_2026" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("MathArena/aime_2026", "default", split="train", cache_dir=str(cache_path)) + else: + ds = load_dataset("MathArena/aime_2026", "default", split="train") + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime2026" + self.questions.append(question) + + print(f"AIME2026 dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["problem"] + + def get_answer(self, question: Dict) -> str: + return 
str(question["answer"]) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["aime2026"].format( + question=self.get_question_text(question), + ) + class Gsm8kDataset(BaseDataset): def __init__(self, split: str = "test"): self.split = split @@ -1188,7 +1243,7 @@ def main(): "--dataset", type=str, default="aime", - choices=["aime", "aime2025", "gsm8k", "gpqa"], + choices=["aime", "aime2025", "aime2026", "gsm8k", "gpqa"], help="Dataset type (default: aime)" ) parser.add_argument( diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 43e22c5e5ee..a25e912c4d2 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -125,61 +125,107 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co } static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_rdna(const int DKQ, const int DV, const int ncols) { - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 128, 2, 64, 128, 128, 128, 2, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 2, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 2, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 8, 128, 2, 64, 32, 32, 32, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 16, 128, 2, 64, 32, 32, 32, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 32, 128, 2, 64, 32, 32, 32, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 64, 128, 2, 64, 32, 32, 32, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 64, 160, 128, 64, 2, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2, 64, 160, 128, 64, 2, false); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 8, 64, 2, 32, 40, 40, 40, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 16, 64, 2, 32, 40, 40, 40, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 32, 128, 2, 64, 40, 40, 40, 1, true); + 
GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 64, 128, 2, 64, 40, 40, 40, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 128, 128, 128, 1, false); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1, 32, 128, 128, 128, 1, false); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 8, 64, 2, 32, 48, 48, 48, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 16, 64, 2, 32, 48, 48, 48, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 32, 128, 2, 64, 48, 48, 48, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 64, 128, 2, 64, 48, 48, 48, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 64, 4, 32, 96, 64, 128, 1, false); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 128, 1, false); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 128, 1, false); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 8, 64, 2, 32, 56, 56, 56, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16, 64, 2, 32, 56, 56, 56, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 128, 2, 64, 56, 56, 56, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 64, 128, 2, 64, 56, 56, 56, 1, true); - // TODO tune specifically for RDNA - return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 8, 64, 2, 32, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16, 64, 2, 32, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2, 64, 64, 64, 64, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 8, 64, 2, 32, 96, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 64, 2, 32, 96, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2, 64, 96, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2, 64, 96, 64, 64, 1, true); + + 
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 2, 32, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 2, 32, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 160, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2, 32, 160, 128, 128, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 8, 128, 3, 64, 96, 64, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 128, 3, 64, 96, 64, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 128, 2, 32, 128, 128, 128, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 8, 128, 3, 64, 96, 64, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 128, 3, 64, 96, 64, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 128, 2, 32, 160, 128, 128, 1, true); + + return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false); } static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_cdna(const int DKQ, const int DV, const int ncols) { - // Conservative configs for CDNA (MI100+): 64KB LDS, wavefront64, nstages=1 (no cp.async). 
- GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 8, 128, 2, 128, 32, 32, 32, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 16, 128, 2, 64, 32, 32, 32, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 32, 128, 2, 64, 32, 32, 32, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 32, 32, 32, 1, true); - - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 8, 128, 2, 128, 40, 40, 40, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 16, 128, 2, 64, 40, 40, 40, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 32, 128, 2, 64, 40, 40, 40, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 8, 128, 1, 64, 32, 32, 32, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 16, 256, 2, 64, 32, 32, 32, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 32, 32, 32, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64, 64, 64, 256, 4, 64, 32, 32, 32, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 8, 256, 2, 64, 40, 40, 40, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 16, 256, 2, 64, 40, 40, 40, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 32, 256, 2, 64, 40, 40, 40, 1, true); GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80, 80, 64, 256, 2, 64, 40, 40, 40, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 8, 128, 2, 128, 48, 48, 48, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 16, 128, 2, 64, 48, 48, 48, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 32, 128, 2, 64, 48, 48, 48, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 8, 256, 2, 64, 48, 48, 48, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 16, 256, 2, 64, 48, 48, 48, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 32, 256, 2, 64, 48, 48, 48, 1, true); GGML_CUDA_FATTN_MMA_CONFIG_CASE( 96, 96, 64, 256, 2, 64, 48, 48, 48, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 8, 128, 2, 128, 56, 56, 56, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16, 128, 2, 64, 56, 56, 56, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 128, 2, 
64, 56, 56, 56, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 8, 256, 2, 64, 56, 56, 56, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 16, 256, 2, 64, 56, 56, 56, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 32, 256, 2, 64, 56, 56, 56, 1, true); GGML_CUDA_FATTN_MMA_CONFIG_CASE(112, 112, 64, 256, 2, 64, 56, 56, 56, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 8, 128, 2, 128, 64, 64, 64, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16, 128, 2, 64, 64, 64, 64, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 8, 256, 2, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 16, 256, 2, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64, 64, 64, 1, true); GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 256, 2, 64, 64, 64, 64, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 4, 64, 128, 128, 128, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 4, 32, 128, 128, 128, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 1, true); - GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 256, 2, 32, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 8, 256, 1, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 256, 1, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 256, 1, 64, 64, 64, 64, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 512, 1, 64, 64, 64, 64, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 512, 1, 64, 128, 128, 64, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 256, 1, 64, 160, 128, 
128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 64, 160, 128, 128, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 8, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1, 64, 128, 128, 128, 1, true); + + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 8, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 256, 1, 64, 128, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 256, 1, 64, 160, 128, 128, 1, true); + GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 64, 160, 128, 128, 1, true); - // Fallback for unsupported DKQ values (e.g. 576). Must return non-zero values to satisfy - // compile-time static_asserts even though the kernel guard prevents runtime execution. - // nthreads=256 gives nwarps=4 (warp_size=64) or 8 (warp_size=32), nbatch_fa=128 satisfies np*16 divisibility. 
- return fattn_mma_config(256, 1, 128, 4, 4, 4, 1, false); + return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false); } static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, const int DV, const int ncols, const int cc) { @@ -510,7 +556,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const int jt, const int kb0, const int k_VKQ_sup) { -#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE) +#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int ncols = ncols1 * ncols2; constexpr int cols_per_warp = T_B_KQ::I; @@ -712,6 +758,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( #pragma unroll for (int i00 = 0; i00 < nbatch_fa; i00 += np*T_C_KQ::J) { const int i0 = i00 + (threadIdx.y % np)*T_C_KQ::J; + + // The mask is stored as 16 bit half values, loading them as 32 bit half2 values is preferred in terms of speed. + // However, this is not possible for RDNA3 where 2 consecutive l indices are not consecutive in the mask memory layout. 
+#ifdef RDNA3 +#pragma unroll + for (int l = 0; l < T_C_KQ::ne; ++l) { + const int i = i0 + T_C_KQ::get_j(l); + const int j = ((threadIdx.y / np)*cols_per_warp + T_C_KQ::get_i(l)) / ncols2; + + KQ_C[i00/(np*T_C_KQ::J)].x[l] += __half2float(tile_mask[j*(nbatch_fa + 8) + i]); + } +#else #pragma unroll for (int l0 = 0; l0 < T_C_KQ::ne; l0 += 2) { const int i = (i0 + T_C_KQ::get_j(l0)) / 2; @@ -721,6 +779,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( KQ_C[i00/(np*T_C_KQ::J)].x[l0 + 0] += slope*tmp.x; KQ_C[i00/(np*T_C_KQ::J)].x[l0 + 1] += slope*tmp.y; } +#endif // RDNA3 } } @@ -827,13 +886,23 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) - const half2 KQ_max_scale_h2 = make_half2( - KQ_max_scale[0], KQ_max_scale[0]); + if constexpr (std::is_same_v) { + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]); #pragma unroll - for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) { + for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) { #pragma unroll - for (int l = 0; l < T_C_VKQ::ne; ++l) { - VKQ_C[i].x[l] *= KQ_max_scale_h2; + for (int l = 0; l < T_C_VKQ::ne; ++l) { + VKQ_C[i].x[l] *= KQ_max_scale_h2; + } + } + } else { + static_assert(std::is_same_v, "bad VKQ type"); +#pragma unroll + for (int i = 0; i < DV/T_C_VKQ::J; ++i) { +#pragma unroll + for (int l = 0; l < T_C_VKQ::ne; ++l) { + VKQ_C[i].x[l] *= KQ_max_scale[0]; + } } } #else // Volta @@ -901,9 +970,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const half2 * tile_V_i = !V_is_K_view || i0_stop > 2*nbatch_K2 ? tile_V : tile_V + i0_start/2; #if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) - constexpr int i0_stride = cols_per_warp == 8 ? 
T_C_VKQ::I : 2*T_C_VKQ::J; #pragma unroll - for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += i0_stride) { + for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += T_A_VKQ::I) { static_assert((nbatch_fa/2) % (np*T_A_VKQ::J) == 0, "bad loop size"); #pragma unroll for (int k00 = 0; k00 < nbatch_fa/2; k00 += np*T_A_VKQ::J) { @@ -912,15 +980,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load. load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V); if constexpr (T_B_KQ::I == 8) { - mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]); + mma(VKQ_C[i_VKQ_0/T_A_VKQ::I], A, B[k00/(np*T_A_VKQ::J)]); } else { // Wide version of VKQ_C is column-major. #if defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) // AMD matrix C is column-major. - mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]); + mma(VKQ_C[i_VKQ_0/T_A_VKQ::I], A, B[k00/(np*T_A_VKQ::J)]); #else // swap A and B for CUDA. 
- mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::J)], A); + mma(VKQ_C[i_VKQ_0/T_A_VKQ::I], B[k00/(np*T_A_VKQ::J)], A); #endif // defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) } } @@ -953,11 +1021,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); NO_DEVICE_CODE; -#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE) +#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) } #if defined(TURING_MMA_AVAILABLE) -template struct mma_tile_sizes { +template struct mma_tile_sizes { using T_A_KQ = tile<16, 8, half2>; // row-major using T_B_KQ = tile<16, 8, half2>; // column-major using T_C_KQ = tile<16, 16, float>; // column-major @@ -965,7 +1033,7 @@ template struct mma_tile_sizes { using T_B_VKQ = tile<16, 8, half2>; // column-major using T_C_VKQ = tile<16, 8, half2>; // column-major }; -template<> struct mma_tile_sizes<8> { +template struct mma_tile_sizes { using T_A_KQ = tile<16, 8, half2>; // row-major using T_B_KQ = tile< 8, 8, half2>; // column-major using T_C_KQ = tile<16, 8, float>; // row-major @@ -973,8 +1041,60 @@ template<> struct mma_tile_sizes<8> { using T_B_VKQ = tile< 8, 8, half2>; // column-major using T_C_VKQ = tile<16, 4, half2>; // row-major }; -#elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) -template struct mma_tile_sizes { +#elif defined(AMD_WMMA_AVAILABLE) +#ifdef RDNA3 +template struct mma_tile_sizes { + using T_A_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major + using T_B_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major + using T_C_KQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>; // column-major + using T_A_VKQ = tile<32, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major + using T_B_VKQ = tile<16, 8, half2, 
DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major + using T_C_VKQ = tile<16, 16, half2, DATA_LAYOUT_I_MAJOR>; // column-major +}; +template struct mma_tile_sizes<80, ncols> { + using T_A_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major + using T_B_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major + using T_C_KQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>; // column-major + using T_A_VKQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major + using T_B_VKQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major + using T_C_VKQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>; // column-major +}; +template struct mma_tile_sizes<112, ncols> { + using T_A_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major + using T_B_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major + using T_C_KQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>; // column-major + using T_A_VKQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major + using T_B_VKQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // column-major + using T_C_VKQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>; // column-major +}; +#else +template struct mma_tile_sizes { + using T_A_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR>; // row-major + using T_B_KQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR>; // column-major + using T_C_KQ = tile<16, 16, float, DATA_LAYOUT_I_MAJOR>; // column-major + using T_A_VKQ = tile<32, 8, half2, DATA_LAYOUT_I_MAJOR>; // row-major + using T_B_VKQ = tile<16, 8, half2, DATA_LAYOUT_I_MAJOR>; // column-major + using T_C_VKQ = tile<16, 16, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED>; // column-major +}; +template struct mma_tile_sizes<80, ncols> { + using T_A_KQ = tile<16, 8, half2>; // row-major + using T_B_KQ = tile<16, 8, half2>; // column-major + using T_C_KQ = tile<16, 16, float>; // column-major + using T_A_VKQ = tile<16, 8, half2>; // row-major + using T_B_VKQ = tile<16, 8, half2>; // column-major + using 
T_C_VKQ = tile<16, 8, half2>; // column-major +}; +template struct mma_tile_sizes<112, ncols> { + using T_A_KQ = tile<16, 8, half2>; // row-major + using T_B_KQ = tile<16, 8, half2>; // column-major + using T_C_KQ = tile<16, 16, float>; // column-major + using T_A_VKQ = tile<16, 8, half2>; // row-major + using T_B_VKQ = tile<16, 8, half2>; // column-major + using T_C_VKQ = tile<16, 8, half2>; // column-major +}; +#endif // RDNA3 +#elif defined(AMD_MFMA_AVAILABLE) +template struct mma_tile_sizes { using T_A_KQ = tile<16, 8, half2>; // row-major using T_B_KQ = tile<16, 8, half2>; // column-major using T_C_KQ = tile<16, 16, float>; // column-major @@ -983,7 +1103,7 @@ template struct mma_tile_sizes { using T_C_VKQ = tile<16, 8, half2>; // column-major }; #else // Volta -template struct mma_tile_sizes { +template struct mma_tile_sizes { using T_A_KQ = tile< 8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major using T_B_KQ = tile<32, 4, half2, DATA_LAYOUT_I_MAJOR>; // column-major using T_C_KQ = tile<32, 8, float, DATA_LAYOUT_I_MAJOR>; // column-major @@ -1018,17 +1138,17 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int zt_gqa, const int kb0_start, const int kb0_stop) { -#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE) +#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) //In this kernel Q, K, V are matrices while i, j, k are matrix indices. 
constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int ncols = ncols1 * ncols2; - using T_A_KQ = typename mma_tile_sizes::T_A_KQ; - using T_B_KQ = typename mma_tile_sizes::T_B_KQ; - using T_C_KQ = typename mma_tile_sizes::T_C_KQ; - using T_A_VKQ = typename mma_tile_sizes::T_A_VKQ; - using T_B_VKQ = typename mma_tile_sizes::T_B_VKQ; - using T_C_VKQ = typename mma_tile_sizes::T_C_VKQ; + using T_A_KQ = typename mma_tile_sizes::T_A_KQ; + using T_B_KQ = typename mma_tile_sizes::T_B_KQ; + using T_C_KQ = typename mma_tile_sizes::T_C_KQ; + using T_A_VKQ = typename mma_tile_sizes::T_A_VKQ; + using T_B_VKQ = typename mma_tile_sizes::T_B_VKQ; + using T_C_VKQ = typename mma_tile_sizes::T_C_VKQ; constexpr int cols_per_warp = T_B_KQ::I; constexpr int cols_per_thread = get_cols_per_thread(); @@ -1061,6 +1181,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( T_B_KQ Q_B[(Q_in_reg ? DKQ/(2*T_B_KQ::J) : 1)]; #if defined(TURING_MMA_AVAILABLE) T_C_VKQ VKQ_C[cols_per_warp == 8 ? DV/T_C_VKQ::I : DV/(2*T_C_VKQ::J)]; +#elif defined(AMD_WMMA_AVAILABLE) && defined(RDNA3) + T_C_VKQ VKQ_C[DV % 32 != 0 ? 
DV/T_C_VKQ::J : DV/(2*T_C_VKQ::J)]; #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) T_C_VKQ VKQ_C[ DV/(2*T_C_VKQ::J)]; #else // Volta @@ -1269,12 +1391,23 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } } #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) - const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]); + if constexpr (std::is_same_v) { + const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]); #pragma unroll - for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) { + for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) { #pragma unroll - for (int l = 0; l < T_C_VKQ::ne; ++l) { - VKQ_C[i].x[l] *= KQ_max_scale_h2; + for (int l = 0; l < T_C_VKQ::ne; ++l) { + VKQ_C[i].x[l] *= KQ_max_scale_h2; + } + } + } else { + static_assert(std::is_same_v, "bad VKQ type"); +#pragma unroll + for (int i = 0; i < DV/T_C_VKQ::J; ++i) { +#pragma unroll + for (int l = 0; l < T_C_VKQ::ne; ++l) { + VKQ_C[i].x[l] *= KQ_max_scale[0]; + } } } #else // Volta @@ -1433,6 +1566,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( #pragma unroll for (int k00 = 0; k00 < DV/2; k00 += nbatch_combine) { if constexpr (cols_per_warp == 8) { + static_assert(std::is_same_v, "bad VKQ type"); const int jc_cwd = threadIdx.y*T_B_KQ::I + T_B_KQ::get_i(-1); // jc combine write data #pragma unroll for (int k1 = 0; k1 < nbatch_combine; k1 += T_B_KQ::J) { @@ -1447,14 +1581,45 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } } else { const int j0 = threadIdx.y*cols_per_warp; + if constexpr (std::is_same_v) { + if constexpr (T_C_VKQ::dl == DATA_LAYOUT_I_MAJOR) { #pragma unroll - for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J) { + for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J) { #pragma unroll - for (int l = 0; l < T_C_VKQ::ne; ++l) { - const int j = j0 + T_C_VKQ::get_i(l); - const int k = k1 + T_C_VKQ::get_j(l); + for (int l = 0; l < T_C_VKQ::ne; ++l) { + const 
int j = j0 + T_C_VKQ::get_i(l); + const int k = k1 + T_C_VKQ::get_j(l); - tile_Q[j*tile_stride + k] = VKQ_C[(k00 + k1)/T_C_VKQ::J].x[l]; + tile_Q[j*tile_stride + k] = VKQ_C[(k00 + k1)/T_C_VKQ::J].x[l]; + } + } + } else { + static_assert(T_C_VKQ::dl == DATA_LAYOUT_I_MAJOR_SCRAMBLED, "bad T_C_VKQ data layout"); + using T_C_VKQ_us = tile; // us == unscrambled +#pragma unroll + for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J) { + const T_C_VKQ_us VKQ_C_us = unscramble(VKQ_C[(k00 + k1)/T_C_VKQ::J]); +#pragma unroll + for (int l = 0; l < T_C_VKQ_us::ne; ++l) { + const int j = j0 + T_C_VKQ_us::get_i(l); + const int k = k1 + T_C_VKQ_us::get_j(l); + + tile_Q[j*tile_stride + k] = VKQ_C_us.x[l]; + } + } + } + } else { + static_assert(std::is_same_v, "bad VKQ type"); + half * tile_Q_h = (half *) tile_Q; +#pragma unroll + for (int k1 = 0; k1 < nbatch_combine; k1 += T_C_VKQ::J/2) { +#pragma unroll + for (int l = 0; l < T_C_VKQ::ne; ++l) { + const int j = j0 + T_C_VKQ::get_i(l); + const int k = 2*k1 + T_C_VKQ::get_j(l); + + tile_Q_h[j*(2*tile_stride) + k] = VKQ_C[(k00 + k1)/(T_C_VKQ::J/2)].x[l]; + } } } } @@ -1532,7 +1697,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop); NO_DEVICE_CODE; -#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE) +#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) } template @@ -1559,7 +1724,7 @@ static __global__ void flash_attn_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { -#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && 
defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)) +#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)) // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(DKQ == 128 || DKQ == 256 || DKQ == 512)) { @@ -1585,14 +1750,14 @@ static __global__ void flash_attn_ext_f16( #endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING #if defined(AMD_WMMA_AVAILABLE) - if (ncols1*ncols2 > 32 || ncols1*ncols2 < 16 || DKQ > 128 || ncols2 == 1) { + if (ncols1*ncols2 < 16 || ncols2 == 1 || DKQ > 128) { NO_DEVICE_CODE; return; } #endif // defined(AMD_WMMA_AVAILABLE) #if defined(AMD_MFMA_AVAILABLE) - if (DKQ != 64 && DKQ != 80 && DKQ != 96 && DKQ != 112 && DKQ != 128) { + if (ncols1*ncols2 < 16 || DKQ > 256) { NO_DEVICE_CODE; return; } @@ -1715,7 +1880,7 @@ static __global__ void flash_attn_ext_f16( ne31, ne32, ne33, nb31, nb32, nb33); NO_DEVICE_CODE; -#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) || defined(AMD_MFMA_AVAILABLE)) +#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)) } template diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index e045b04f727..1c7777e8a71 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -19,13 +19,14 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con } if constexpr (ncols2 <= 16) { - if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) { + if (Q->ne[1] <= 16/ncols2) { ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } } - if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) { + if (Q->ne[1] <= 32/ncols2 || 
(GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING) || + (GGML_CUDA_CC_IS_AMD(cc) && DKQ > 256)) { ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } @@ -477,12 +478,13 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const return BEST_FATTN_KERNEL_MMA_F16; } + const int ncols2_max = Q->ne[0] == 320 ? 32 : ((Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8); + int gqa_ratio_eff = 1; + while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) { + gqa_ratio_eff *= 2; + } + if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) { - int gqa_ratio_eff = 1; - const int ncols2_max = (Q->ne[0] == 576 || Q->ne[0] == 192) ? 16 : 8; - while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) { - gqa_ratio_eff *= 2; - } if (can_use_vector_kernel && Q->ne[1] * gqa_ratio_eff <= 2) { return BEST_FATTN_KERNEL_VEC; } @@ -500,41 +502,22 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const return BEST_FATTN_KERNEL_WMMA_F16; } - if (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc) && gqa_opt_applies && Q->ne[0] <= 128 && Q->ne[0] != 40 && Q->ne[0] != 72) { - if (can_use_vector_kernel) { - if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) { - if (Q->ne[1] == 1) { - if (!gqa_opt_applies) { - return BEST_FATTN_KERNEL_VEC; - } - } - } else { - if (Q->ne[1] <= 2) { - return BEST_FATTN_KERNEL_VEC; - } - } + // AMD MFMA needs a certain minimum batch size to outscale the tile kernel for large head sizes. + if ((amd_mfma_available(cc) && Q->ne[0] <= 256) && Q->ne[0] != 40 && Q->ne[0] != 72) { + if ((Q->ne[0] <= 64 && Q->ne[1] * gqa_ratio_eff > 8)) { + return BEST_FATTN_KERNEL_MMA_F16; } - int gqa_ratio_eff = 1; - const int ncols2_max = Q->ne[0] == 576 ? 
16 : 8; - while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) { - gqa_ratio_eff *= 2; + if ((Q->ne[0] <= 128 && Q->ne[1] * gqa_ratio_eff > 16)) { + return BEST_FATTN_KERNEL_MMA_F16; } - if (Q->ne[1] * gqa_ratio_eff <= 8) { - return BEST_FATTN_KERNEL_TILE; // AMD WMMA is only faster if the full tile width of 16 can be utilized. + if ((Q->ne[0] <= 256 && Q->ne[1] * gqa_ratio_eff > 64)) { + return BEST_FATTN_KERNEL_MMA_F16; } - return BEST_FATTN_KERNEL_MMA_F16; } - // Use MFMA flash attention for CDNA (MI100+): - if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) { - const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1); - // MMA vs tile crossover benchmarked on MI300X @ d32768: - // hsk=64 (gqa=4): MMA wins at eff >= 128 (+11%) - // hsk=128 (gqa=4): MMA wins at eff >= 128 (+4%) - if (eff_nq >= (GGML_CUDA_CC_IS_CDNA1(cc) && Q->ne[0] == 64 ? 64 : 128)) { - return BEST_FATTN_KERNEL_MMA_F16; - } - // Fall through to tile kernel for small effective batch sizes. + // AMD WMMA is always faster than the tile kernel if the full tile width of 16 can be utilized. + if ((amd_wmma_available(cc) && gqa_opt_applies && Q->ne[0] <= 128) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[1] * gqa_ratio_eff > 8) { + return BEST_FATTN_KERNEL_MMA_F16; } // If there are no tensor cores available, use the generic tile kernel: diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 79bb2934c5f..8d7c69dc3e8 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -80,6 +80,7 @@ namespace ggml_cuda_mma { DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3. DATA_LAYOUT_I_MAJOR_MIRRORED = 20, // Volta, matrix A&B for RDNA3. DATA_LAYOUT_J_MAJOR_MIRRORED = 30, + DATA_LAYOUT_I_MAJOR_SCRAMBLED = 40, // Scrambled matrix C for faster transposition (RDNA4/CDNA), convert to float to unscramble. 
}; // Implemented mma combinations are: // - (I_MAJOR, I_MAJOR) -> I_MAJOR @@ -312,13 +313,19 @@ namespace ggml_cuda_mma { half2 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { - if (I == 16 && J == 8) return true; + if (I == 16 && J == 8) return true; + if (I == 16 && J == 16) return true; + if (I == 32 && J == 8) return true; return false; } static __device__ __forceinline__ int get_i(const int l) { if constexpr (I == 16 && J == 8) { return threadIdx.x % 16; + } else if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else if constexpr (I == 32 && J == 8) { + return (threadIdx.x % 16) * 2 + l / (ne/2); } else { NO_DEVICE_CODE; return -1; @@ -327,7 +334,15 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { - return ne * (threadIdx.x / 16) + l; + return (threadIdx.x / 16) * ne + l; + } else if constexpr (I == 16 && J == 16) { +#ifdef RDNA3 + return l*2 + (threadIdx.x / 16); +#else + return (threadIdx.x / 16) * ne + l; +#endif // RDNA3 + } else if constexpr (I == 32 && J == 8) { + return (threadIdx.x / 16) * (ne/2) + l % (ne/2); } else { NO_DEVICE_CODE; return -1; @@ -338,13 +353,19 @@ namespace ggml_cuda_mma { half2 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { - if (I == 16 && J == 8) return true; + if (I == 16 && J == 8) return true; + if (I == 16 && J == 16) return true; + if (I == 32 && J == 8) return true; return false; } static __device__ __forceinline__ int get_i(const int l) { if constexpr (I == 16 && J == 8) { return threadIdx.x % 16; + } else if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else if constexpr (I == 32 && J == 8) { + return (threadIdx.x % 16) * 2 + l / (ne/2); } else { NO_DEVICE_CODE; return -1; @@ -353,7 +374,11 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { - return ne * (threadIdx.x / 16) + l; + return 
(threadIdx.x / 16) * ne + l; + } else if constexpr (I == 16 && J == 16) { + return (threadIdx.x / 16) * ne + l; + } else if constexpr (I == 32 && J == 8) { + return (threadIdx.x / 16) * (ne/2) + l % (ne/2); } else { NO_DEVICE_CODE; return -1; @@ -516,12 +541,15 @@ namespace ggml_cuda_mma { if (I == 16 && J == 16) return true; if (I == 16 && J == 8) return true; if (I == 16 && J == 4) return true; + if (I == 32 && J == 8) return true; return false; } - static __device__ __forceinline__ int get_i(const int /*l*/) { - if constexpr (supported()) { + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16) { return threadIdx.x % 16; + } else if constexpr (I == 32) { + return (threadIdx.x % 16) * 2 + l / (ne/2); } else { NO_DEVICE_CODE; return -1; @@ -529,8 +557,10 @@ namespace ggml_cuda_mma { } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (supported()) { + if constexpr (I == 16) { return l; + } else if constexpr (I == 32) { + return l % (ne/2); } else { NO_DEVICE_CODE; return -1; @@ -644,6 +674,40 @@ namespace ggml_cuda_mma { } }; + template + struct tile { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_SCRAMBLED; + + static constexpr int ne = I * J / ggml_cuda_get_physical_warp_size(); + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile::get_i(l); + } + }; + + static __device__ __forceinline__ tile<16, 16, half2, DATA_LAYOUT_I_MAJOR> unscramble(const tile<16, 16, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED> & t) { +#if defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) + tile<16, 16, half2, DATA_LAYOUT_I_MAJOR> ret; +#pragma unroll + for (int l0 = 0; l0 < t.ne/2; ++l0) { + ret.x[2*l0 + 0] = __lows2half2(t.x[l0], t.x[l0 + t.ne/2]); + ret.x[2*l0 + 1] = 
__highs2half2(t.x[l0], t.x[l0 + t.ne/2]); + } + return ret; +#else + NO_DEVICE_CODE; + GGML_UNUSED(t); +#endif // defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) + } + #if defined(TURING_MMA_AVAILABLE) template static __device__ __forceinline__ tile get_half2(const tile & tile_float) { @@ -660,6 +724,21 @@ namespace ggml_cuda_mma { ret.x[0] = ggml_cuda_movmatrix(t.x[0]); ret.x[1] = ggml_cuda_movmatrix(t.x[1]); + return ret; + } +#elif defined(AMD_WMMA_AVAILABLE) && defined(RDNA3) + static __device__ __forceinline__ tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> get_half2( + const tile<16, 16, float, DATA_LAYOUT_I_MAJOR> & tile_float) { + tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> ret; +#pragma unroll + for (int l = 0; l < tile_float.ne; ++l) { + float tmp[2]; + int i = threadIdx.x / 16; + tmp[i] = tile_float.x[l]; + i ^= 1; + tmp[i] = __shfl_xor_sync(0xFFFFFFFF, tile_float.x[l], 16, WARP_SIZE); + ret.x[l] = make_half2(tmp[0], tmp[1]); + } return ret; } #elif defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE) @@ -802,21 +881,35 @@ namespace ggml_cuda_mma { #endif // defined(VOLTA_MMA_AVAILABLE) } - template + template static __device__ __forceinline__ void load_ldmatrix_trans( - tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) { + tile & t, const T * __restrict__ xs0, const int stride) { #ifdef TURING_MMA_AVAILABLE + static_assert(I == 16, "bad tile width"); + static_assert(dl == DATA_LAYOUT_I_MAJOR, "bad data layout"); int * xi = (int *) t.x; const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2); asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.b16 {%0, %1, %2, %3}, [%4];" : "=r"(xi[0]), "=r"(xi[2]), "=r"(xi[1]), "=r"(xi[3]) : "l"(xs)); #elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - half * xh = (half *) t.x; + static_assert(dl == DATA_LAYOUT_I_MAJOR || dl == DATA_LAYOUT_I_MAJOR_MIRRORED, "bad data layout"); + if constexpr (I == 32) { 
#pragma unroll - for (int l = 0; l < t.ne; ++l) { - xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)]; - xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)]; + for (int l0 = 0; l0 < t.ne/2; ++l0) { + const half2 tmp0 = xs0[(2*t.get_j(l0) + 0)*stride + t.get_i(l0)/2]; + const half2 tmp1 = xs0[(2*t.get_j(l0) + 1)*stride + t.get_i(l0)/2]; + + t.x[l0] = __lows2half2(tmp0, tmp1); + t.x[l0 + t.ne/2] = __highs2half2(tmp0, tmp1); + } + } else { + half * xh = (half *) t.x; +#pragma unroll + for (int l = 0; l < t.ne; ++l) { + xh[2*l + 0] = ((const half *) xs0)[(2*t.get_j(l) + 0)*(2*stride) + t.get_i(l)]; + xh[2*l + 1] = ((const half *) xs0)[(2*t.get_j(l) + 1)*(2*stride) + t.get_i(l)]; + } } #else GGML_UNUSED_VARS(t, xs0, stride); @@ -972,6 +1065,20 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } + static __device__ __forceinline__ void mma( + tile<16, 16, half2, DATA_LAYOUT_I_MAJOR_SCRAMBLED> & D, const tile<32, 8, half2, DATA_LAYOUT_I_MAJOR> & A, + const tile<16, 8, half2, DATA_LAYOUT_I_MAJOR> & B) { +#if defined(AMD_MFMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) + tile<16, 8, half2> * D16 = (tile<16, 8, half2> *) &D; + const tile<16, 8, half2> * A16 = (const tile<16, 8, half2> *) &A; + mma(D16[0], A16[0], B); + mma(D16[1], A16[1], B); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; +#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) && defined(RDNA4) + } + template static __device__ __forceinline__ void mma( tile<16, 8, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<8, 8, float, dl_ab> & B) { @@ -1296,6 +1403,22 @@ namespace ggml_cuda_mma { #endif // defined(VOLTA_MMA_AVAILABLE) } + static __device__ __forceinline__ void mma( + tile<16, 16, half2, DATA_LAYOUT_I_MAJOR> & D, const tile<32, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & A, + const tile<16, 8, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) { +#if defined(AMD_WMMA_AVAILABLE) && 
defined(RDNA3) + using halfx16_t = __attribute__((ext_vector_type(16))) _Float16; + halfx16_t * xD = (halfx16_t *) D.x; + const halfx16_t * xA = (const halfx16_t *) A.x; + const halfx16_t * xB = (const halfx16_t *) B.x; + xD[0] = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(xA[0], xB[0], xD[0], /*opsel =*/ 0); + xD[0] = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(xA[1], xB[0], xD[0], /*opsel =*/ 1); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; +#endif // TURING_MMA_AVAILABLE + } + template static __device__ __forceinline__ void mma( tile<16, 16, int, dl_d> & D, const tile<16, 4, int, dl_ab> & A, const tile<16, 4, int, dl_ab> & B) { diff --git a/ggml/src/ggml-hexagon/htp/cpy-ops.c b/ggml/src/ggml-hexagon/htp/cpy-ops.c index e5b9d350fd7..5c040a32224 100644 --- a/ggml/src/ggml-hexagon/htp/cpy-ops.c +++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c @@ -88,6 +88,29 @@ static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + // Fast path: when both src0 and dst are contiguous in memory + // Replace the element-by-element loop with a single bulk HVX copy per (i03, i02) slice. 
+ const bool src0_contig = (nb00 == ct->src0_type_size) && + (nb01 == ne00 * nb00) && + (nb02 == ne01 * nb01) && + (nb03 == ne02 * nb02); + const bool dst_contig = (nb0 == ct->dst_type_size) && + (nb1 == ne0 * nb0) && + (nb2 == ne1 * nb1) && + (nb3 == ne2 * nb2); + + if (src0_contig && dst_contig) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01; + uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00; + uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ct->src0_type_size; + hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size); + } + } + return; + } + // dst counters int64_t k10 = 0; int64_t i11 = 0; diff --git a/scripts/snapdragon/qdc/run_qdc_jobs.py b/scripts/snapdragon/qdc/run_qdc_jobs.py index b4eede3d019..f1b0453eec4 100644 --- a/scripts/snapdragon/qdc/run_qdc_jobs.py +++ b/scripts/snapdragon/qdc/run_qdc_jobs.py @@ -1,4 +1,4 @@ -"""Run llama.cpp Hexagon Android tests in a single QDC Appium job. +"""Run llama.cpp Hexagon tests in a single QDC job. 
Bundles test scripts into one artifact and submits a single QDC job: @@ -10,6 +10,10 @@ Prerequisites: pip install /path/to/qualcomm_device_cloud_sdk*.whl +Platform is inferred from --device: + android Appium + pytest (Android phones: SM8750 / SM8650 / SM8850) + linux BASH (Linux IoT: QCS9075M) + Required environment variables: QDC_API_KEY API key from QDC UI -> Users -> Settings -> API Keys @@ -23,6 +27,7 @@ from __future__ import annotations import argparse +import enum import logging import os import re @@ -30,15 +35,35 @@ import sys import tempfile import time +import urllib.request import xml.etree.ElementTree as ET from dataclasses import dataclass, field from pathlib import Path +from typing import Callable + +from qualcomm_device_cloud_sdk.api import qdc_api +from qualcomm_device_cloud_sdk.logging import configure_logging +from qualcomm_device_cloud_sdk.models import ( + ArtifactType, + JobMode, + JobState, + JobSubmissionParameter, + JobType, + TestFramework, +) -from qualcomm_device_cloud_sdk.api import qdc_api # ty: ignore[unresolved-import] -from qualcomm_device_cloud_sdk.logging import configure_logging # ty: ignore[unresolved-import] -from qualcomm_device_cloud_sdk.models import ArtifactType, JobMode, JobState, JobSubmissionParameter, JobType, TestFramework # ty: ignore[unresolved-import] - +# configure_logging only sets up the SDK logger; basicConfig is needed for +# our own log.info to reach stdout. +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], +) configure_logging(level=logging.INFO, handlers=[logging.StreamHandler()]) +# Silence per-poll GET/status spam from the SDK and its HTTP client. 
+logging.getLogger("qualcomm_device_cloud").setLevel(logging.WARNING) +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) log = logging.getLogger(__name__) POLL_INTERVAL = 30 @@ -47,23 +72,56 @@ CAPACITY_TIMEOUT = 1800 CAPACITY_POLL = 60 MAX_CONCURRENT_JOBS = 5 +DEFAULT_RETRIES = 0 +RETRY_DELAY = 300 TERMINAL_STATES = {JobState.COMPLETED, JobState.CANCELED} NON_TERMINAL_STATES = {JobState.DISPATCHED, JobState.RUNNING, JobState.SETUP, JobState.SUBMITTED} -_SCRIPTS_DIR = Path(__file__).parent -_TESTS_DIR = _SCRIPTS_DIR / "tests" -_RUN_BENCH = _TESTS_DIR / "run_bench_tests_posix.py" -_RUN_BACKEND_OPS = _TESTS_DIR / "run_backend_ops_posix.py" -_UTILS = _TESTS_DIR / "utils.py" -_CONFTEST = _TESTS_DIR / "conftest.py" -_REQUIREMENTS = _SCRIPTS_DIR / "requirements.txt" +class DeviceUnavailableError(Exception): + """Raised when the QDC device resource is not available (retryable).""" + + +_SCRIPTS_DIR = Path(__file__).parent +_TESTS_DIR = _SCRIPTS_DIR / "tests" + +# --- Shared test assets ------------------------------------------------------- +_UTILS = _TESTS_DIR / "utils.py" +_CONFTEST = _TESTS_DIR / "conftest.py" _PYTEST_LINE_RE = re.compile( r"(?:[\w/]+\.py::)?(?:\w+::)?([\w\[\].-]+)\s+(PASSED|FAILED|ERROR|SKIPPED)" ) -_EXCLUDED_LOGS = {"qdc_android_whole_host-000.log", "qdc_kernel_host-000.log"} +_EXCLUDED_LOGS = { + "qdc_android_whole_host-000.log", + "qdc_kernel_host-000.log", + "qdc_LE_whole_host-000.log", + "qdc_LE_kernel_host-000.log", + "script.log", +} _NON_TERMINAL_STATE_VALUES = {s.value for s in NON_TERMINAL_STATES} +# --- Android (Appium + pytest) assets ---------------------------------------- +_RUN_BENCH = _TESTS_DIR / "run_bench_tests_posix.py" +_RUN_BACKEND_OPS = _TESTS_DIR / "run_backend_ops_posix.py" +_REQUIREMENTS = _SCRIPTS_DIR / "requirements.txt" +_UPSTREAM_ADB_SCRIPTS = ( + "https://raw.githubusercontent.com/ggml-org/llama.cpp/master/scripts/snapdragon/adb" +) +_ADB_SCRIPT_NAMES = [ + 
"run-bench.sh", + "run-cli.sh", + "run-completion.sh", + "run-tool.sh", +] + +# --- Linux (BASH) assets ------------------------------------------------------ +_RUN_LINUX_TEMPLATE = _TESTS_DIR / "linux" / "run_linux.sh" +_LINUX_ENTRY_SCRIPT = "/bin/bash /data/local/tmp/TestContent/run_linux.sh" + +# ============================================================================= +# Artifact builders (per platform) +# ============================================================================= + @dataclass class JobResult: @@ -73,35 +131,58 @@ class JobResult: failure_details: dict[str, str] = field(default_factory=dict) -def build_artifact_zip( +def _write_lf(path: Path, content: str) -> None: + """Write text with LF line endings (required by /bin/bash on Linux).""" + with open(path, "w", encoding="utf-8", newline="\n") as f: + f.write(content) + + +def _build_android_artifact( pkg_dir: Path, stage_dir: Path, - *, - test_mode: str = "bench", - model_url: str | None = None, + test_mode: str, + model_url: str | None, ) -> Path: - """Bundle everything into a single QDC artifact zip. + """Android zip (Appium/pytest). Extracted by QDC under /qdc/appium/. 
- Zip structure (extracted by QDC to /qdc/appium/ on the runner): + Zip structure: llama_cpp_bundle/ installed package (adb pushed to /data/local/tmp/) + run-{bench,cli,completion,tool}.sh upstream adb wrappers (patched) tests/ - utils.py shared helpers (paths, run_adb_command, …) - conftest.py shared pytest fixtures (driver) - test_bench_posix.py bench + cli tests (<> substituted) - AND/OR - test_backend_ops_posix.py test-backend-ops -b HTP0 + utils.py shared adb helpers + conftest.py Appium pytest fixtures + test_bench_posix.py bench + cli tests (for --test bench or all) + test_backend_ops_posix.py test-backend-ops on HTP0 requirements.txt + pytest.ini addopts = --junitxml=results.xml """ - shutil.copytree(pkg_dir, stage_dir / "llama_cpp_bundle") + bundle_dir = stage_dir / "llama_cpp_bundle" + shutil.copytree(pkg_dir, bundle_dir) + + # Download upstream adb scripts so they land at /qdc/appium/ on the QDC + # runner. They wrap `adb shell` internally. Patch in `chmod +x bin/* lib/*` + # right after `cd $basedir` so device binaries are executable. 
+ for name in _ADB_SCRIPT_NAMES: + url = f"{_UPSTREAM_ADB_SCRIPTS}/{name}" + dest = stage_dir / name + log.info("Downloading %s", url) + urllib.request.urlretrieve(url, str(dest)) + content = dest.read_text() + content = content.replace( + "cd $basedir;", + "cd $basedir; chmod +x bin/* lib/* 2>/dev/null;", + ) + dest.write_text(content) + dest.chmod(0o755) tests_dir = stage_dir / "tests" tests_dir.mkdir() - shutil.copy(_UTILS, tests_dir / "utils.py") + shutil.copy(_UTILS, tests_dir / "utils.py") shutil.copy(_CONFTEST, tests_dir / "conftest.py") if test_mode in ("bench", "all"): - assert model_url is not None, "--model-url is required for bench/all test modes" + assert model_url is not None (tests_dir / "test_bench_posix.py").write_text( _RUN_BENCH.read_text().replace("<>", model_url) ) @@ -109,33 +190,140 @@ def build_artifact_zip( shutil.copy(_RUN_BACKEND_OPS, tests_dir / "test_backend_ops_posix.py") shutil.copy(_REQUIREMENTS, stage_dir / "requirements.txt") - (stage_dir / "pytest.ini").write_text("[pytest]\naddopts = --junitxml=results.xml\n") + (stage_dir / "pytest.ini").write_text( + "[pytest]\naddopts = --junitxml=results.xml\n" + ) zip_base = str(stage_dir / "artifact") shutil.make_archive(zip_base, "zip", stage_dir) return Path(f"{zip_base}.zip") +def _build_linux_artifact( + pkg_dir: Path, + stage_dir: Path, + test_mode: str, + model_url: str | None, +) -> Path: + """Linux IoT zip (BASH framework). Extracted by QDC to /data/local/tmp/TestContent/. 
+ + Zip structure: + run_linux.sh entry script (placeholder-substituted, LF line endings) + llama_cpp_bundle/ installed package + """ + bundle_dir = stage_dir / "llama_cpp_bundle" + shutil.copytree(pkg_dir, bundle_dir) + + template = _RUN_LINUX_TEMPLATE.read_text(encoding="utf-8") + rendered = template.replace("{MODEL_URL}", model_url or "").replace( + "{TEST_MODE}", test_mode + ) + script_path = stage_dir / "run_linux.sh" + _write_lf(script_path, rendered) + script_path.chmod(0o755) + + zip_base = str(stage_dir / "artifact") + shutil.make_archive(zip_base, "zip", stage_dir) + return Path(f"{zip_base}.zip") + + +# ============================================================================= +# Platform enum + strategy table +# ============================================================================= + + +class Platform(enum.Enum): + ANDROID = "android" + LINUX = "linux" + + +@dataclass(frozen=True) +class PlatformSpec: + test_framework: TestFramework + entry_script: str | None + build_artifact: Callable[[Path, Path, str, str | None], Path] + job_name_fmt: str + + +PLATFORM_SPECS: dict[Platform, PlatformSpec] = { + Platform.ANDROID: PlatformSpec( + test_framework=TestFramework.APPIUM, + entry_script=None, + build_artifact=_build_android_artifact, + job_name_fmt="{base}", + ), + Platform.LINUX: PlatformSpec( + test_framework=TestFramework.BASH, + entry_script=_LINUX_ENTRY_SCRIPT, + build_artifact=_build_linux_artifact, + job_name_fmt="{base} (Linux)", + ), +} + +DEVICE_PLATFORM: dict[str, Platform] = { + "SM8750": Platform.ANDROID, + "SM8650": Platform.ANDROID, + "SM8850": Platform.ANDROID, + "QCS9075M": Platform.LINUX, +} + + +# ============================================================================= +# Shared QDC job plumbing +# ============================================================================= + + def wait_for_job(client, job_id: str, timeout: int) -> str: elapsed = 0 + last_state = None + consecutive_errors = 0 + max_consecutive_errors = 5 
while elapsed < timeout: - raw = qdc_api.get_job_status(client, job_id) + try: + raw = qdc_api.get_job_status(client, job_id) + consecutive_errors = 0 + except Exception as e: + consecutive_errors += 1 + log.warning( + "Transient error polling job %s (%d/%d): %s", + job_id, + consecutive_errors, + max_consecutive_errors, + e, + ) + if consecutive_errors >= max_consecutive_errors: + raise + time.sleep(POLL_INTERVAL) + elapsed += POLL_INTERVAL + continue try: status = JobState(raw) except ValueError: status = raw if status in TERMINAL_STATES: return raw.lower() - log.info("Job %s: %s", job_id, raw) + if raw != last_state: + log.info("Job %s: %s", job_id, raw) + last_state = raw time.sleep(POLL_INTERVAL) elapsed += POLL_INTERVAL + # Abort to free the QDC concurrency slot instead of leaking it. + try: + qdc_api.abort_job(client, job_id) + log.warning("Aborted job %s after timeout to free concurrency slot", job_id) + except Exception as e: + log.warning("Failed to abort job %s: %s", job_id, e) raise TimeoutError(f"Job {job_id} did not finish within {timeout}s") def wait_for_log_upload(client, job_id: str) -> None: elapsed = 0 while elapsed <= LOG_UPLOAD_TIMEOUT: - status = (qdc_api.get_job_log_upload_status(client, job_id) or "").lower() + try: + status = (qdc_api.get_job_log_upload_status(client, job_id) or "").lower() + except Exception as e: + log.warning("get_job_log_upload_status failed: %s — will retry", e) + status = "" if status in {"completed", "failed"}: return log.info("Waiting for log upload (status=%s) ...", status) @@ -150,17 +338,33 @@ def wait_for_capacity(client, max_jobs: int = MAX_CONCURRENT_JOBS) -> None: while elapsed < CAPACITY_TIMEOUT: jobs_page = qdc_api.get_jobs_list(client, page_number=0, page_size=50) if jobs_page is None: - log.warning("Could not retrieve job list; proceeding without capacity check") + log.warning( + "Could not retrieve job list; proceeding without capacity check" + ) return items = getattr(jobs_page, "data", []) or [] - 
active = sum(1 for j in items if getattr(j, "state", None) in _NON_TERMINAL_STATE_VALUES) + active = sum( + 1 for j in items if getattr(j, "state", None) in _NON_TERMINAL_STATE_VALUES + ) if active < max_jobs: log.info("Active QDC jobs: %d / %d — proceeding", active, max_jobs) return - log.info("Active QDC jobs: %d / %d — waiting %ds ...", active, max_jobs, CAPACITY_POLL) + log.info( + "Active QDC jobs: %d / %d — waiting %ds ...", + active, + max_jobs, + CAPACITY_POLL, + ) time.sleep(CAPACITY_POLL) elapsed += CAPACITY_POLL - log.warning("Capacity wait timed out after %ds; proceeding anyway", CAPACITY_TIMEOUT) + raise TimeoutError( + f"Capacity wait timed out after {CAPACITY_TIMEOUT}s" + ) + + +# --------------------------------------------------------------------------- +# Log parsing helpers +# --------------------------------------------------------------------------- def _parse_junit_xml(content: str) -> tuple[dict[str, bool], dict[str, str]]: @@ -192,10 +396,26 @@ def _parse_pytest_output(content: str) -> dict[str, bool]: def fetch_logs_and_parse_tests( - client, job_id: str + client, job_id: str, max_retries: int = 5, retry_delay: int = 30 ) -> tuple[dict[str, bool], dict[str, str], dict[str, str]]: """Returns (test_results, raw_logs, failure_details).""" - log_files = qdc_api.get_job_log_files(client, job_id) + log_files = None + for attempt in range(1, max_retries + 1): + try: + log_files = qdc_api.get_job_log_files(client, job_id) + break + except Exception as e: + if attempt < max_retries: + log.warning( + "get_job_log_files failed (attempt %d/%d): %s — retrying in %ds", + attempt, max_retries, e, retry_delay, + ) + time.sleep(retry_delay) + else: + log.error( + "get_job_log_files failed after %d attempts: %s", max_retries, e + ) + return {}, {}, {} if not log_files: log.warning("No log files returned for job %s", job_id) return {}, {}, {} @@ -207,8 +427,8 @@ def fetch_logs_and_parse_tests( with tempfile.TemporaryDirectory() as tmpdir: for lf in log_files: 
- log.info("Downloading log file: %s", lf.filename) zip_path = os.path.join(tmpdir, "log.zip") + log.info("Downloading log file: %s", lf.filename) qdc_api.download_job_log_files(client, lf.filename, zip_path) try: shutil.unpack_archive(zip_path, tmpdir, "zip") @@ -226,12 +446,15 @@ def fetch_logs_and_parse_tests( elif fname.endswith(".log"): if fname in _EXCLUDED_LOGS: continue - log.info("--- %s ---", fname) - log.info("%s", content) + log.info("--- %s ---\n%s", fname, content) raw_logs[fname] = content pytest_fallback.update(_parse_pytest_output(content)) - return (test_results if test_results else pytest_fallback), raw_logs, failure_details + return ( + (test_results if test_results else pytest_fallback), + raw_logs, + failure_details, + ) def write_summary(result: JobResult, title: str = "QDC Test Results") -> None: @@ -289,30 +512,106 @@ def write_summary(result: JobResult, title: str = "QDC Test Results") -> None: f.write("\n".join(lines) + "\n") +# ============================================================================= +# CLI + main +# ============================================================================= + def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) - p.add_argument("--pkg-dir", required=True, type=Path, + p.add_argument("--pkg-dir", required=True, type=Path, help="Installed llama.cpp package directory (contains bin/ and lib/)") p.add_argument("--model-url", help="Direct URL to the GGUF model file (required for --test bench)") - p.add_argument("--device", required=True, + p.add_argument("--device", required=True, help="QDC chipset name, e.g. 
SM8750") p.add_argument("--test", choices=["bench", "backend-ops", "all"], default="bench", help="Test suite to run (default: bench)") p.add_argument("--job-timeout", type=int, default=JOB_TIMEOUT, metavar="SECONDS", help=f"Max seconds to wait for job completion (default: {JOB_TIMEOUT})") + p.add_argument("--retries", type=int, default=DEFAULT_RETRIES, metavar="N", + help="Number of retries when device is unavailable (default: 0)") + p.add_argument("--retry-delay", type=int, default=RETRY_DELAY, metavar="SECONDS", + help=f"Seconds to wait between retries (default: {RETRY_DELAY})") args = p.parse_args() if args.test in ("bench", "all") and not args.model_url: p.error("--model-url is required when --test bench or --test all") return args +def _submit_and_run_job(client, args, spec, target_id, artifact_id) -> JobResult: + """Submit a QDC job and wait for results. + + Raises DeviceUnavailableError for transient device/resource issues that + are worth retrying. Returns JobResult for definitive outcomes (pass or + test failure). 
+ """ + try: + wait_for_capacity(client) + except TimeoutError: + raise DeviceUnavailableError("Capacity wait timed out — device busy") + + job_name = spec.job_name_fmt.format(base="llama.cpp Hexagon tests") + + job_id = qdc_api.submit_job( + public_api_client=client, + target_id=target_id, + job_name=job_name, + external_job_id=None, + job_type=JobType.AUTOMATED, + job_mode=JobMode.APPLICATION, + timeout=max(1, args.job_timeout // 60), + test_framework=spec.test_framework, + entry_script=spec.entry_script, + job_artifacts=[artifact_id], + monkey_events=None, + monkey_session_timeout=None, + job_parameters=[JobSubmissionParameter.WIFIENABLED], + ) + if job_id is None: + raise DeviceUnavailableError("Job submission failed — device may be unavailable") + log.info("Job submitted: %s (device=%s)", job_id, args.device) + + try: + job_status = wait_for_job(client, job_id, timeout=args.job_timeout) + except TimeoutError as e: + raise DeviceUnavailableError(str(e)) + log.info("Job %s finished: %s", job_id, job_status) + + wait_for_log_upload(client, job_id) + tests, raw_logs, failure_details = fetch_logs_and_parse_tests(client, job_id) + + job_ok = job_status == JobState.COMPLETED.value.lower() + + if not job_ok and not tests: + raise DeviceUnavailableError( + f"Job did not complete (status={job_status}) and produced no test results" + ) + + passed = job_ok and all(tests.values()) if tests else job_ok + if spec.test_framework == TestFramework.BASH and not tests: + log.error("No test results recovered (state=%s). Script likely never ran.", job_status) + passed = False + if not passed: + log.error("Job did not complete successfully or tests failed (status=%s)", job_status) + + return JobResult(passed=passed, tests=tests, raw_logs=raw_logs, failure_details=failure_details) + + def main() -> int: args = parse_args() + platform = DEVICE_PLATFORM.get(args.device) + if platform is None: + log.error( + "Unknown device %r. 
Known: %s", + args.device, ", ".join(sorted(DEVICE_PLATFORM.keys())), + ) + return 1 + spec = PLATFORM_SPECS[platform] + api_key = os.environ.get("QDC_API_KEY") if not api_key: log.error("QDC_API_KEY environment variable must be set") @@ -334,10 +633,9 @@ def main() -> int: return 1 with tempfile.TemporaryDirectory() as tmpdir: - log.info("Building artifact ...") - zip_path = build_artifact_zip( - args.pkg_dir, Path(tmpdir), - test_mode=args.test, model_url=args.model_url, + log.info("Building %s artifact (test=%s) ...", platform.value, args.test) + zip_path = spec.build_artifact( + args.pkg_dir, Path(tmpdir), args.test, args.model_url ) log.info("Uploading artifact (%d MB) ...", zip_path.stat().st_size // 1_000_000) artifact_id = qdc_api.upload_file(client, str(zip_path), ArtifactType.TESTSCRIPT) @@ -346,46 +644,31 @@ def main() -> int: log.error("Artifact upload failed") return 1 - wait_for_capacity(client) - - job_id = qdc_api.submit_job( - public_api_client=client, - target_id=target_id, - job_name="llama.cpp Hexagon tests", - external_job_id=None, - job_type=JobType.AUTOMATED, - job_mode=JobMode.APPLICATION, - timeout=max(1, args.job_timeout // 60), - test_framework=TestFramework.APPIUM, - entry_script=None, - job_artifacts=[artifact_id], - monkey_events=None, - monkey_session_timeout=None, - job_parameters=[JobSubmissionParameter.WIFIENABLED], - ) - if job_id is None: - log.error("Job submission failed") - return 1 - log.info("Job submitted: %s (device=%s)", job_id, args.device) - - try: - job_status = wait_for_job(client, job_id, timeout=args.job_timeout) - except TimeoutError as e: - log.error("%s", e) - write_summary(JobResult(passed=False, tests={}), title=f"QDC Job Timed Out ({args.device})") + max_attempts = 1 + args.retries + for attempt in range(1, max_attempts + 1): + try: + result = _submit_and_run_job(client, args, spec, target_id, artifact_id) + break + except DeviceUnavailableError as e: + if attempt < max_attempts: + log.warning( + "Attempt 
%d/%d failed (device unavailable): %s — retrying in %ds", + attempt, max_attempts, e, args.retry_delay, + ) + time.sleep(args.retry_delay) + else: + log.error( + "Attempt %d/%d failed (device unavailable): %s — no retries left", + attempt, max_attempts, e, + ) + write_summary( + JobResult(passed=False, tests={}), + title=f"QDC Device Unavailable ({args.device})", + ) + return 1 + else: return 1 - log.info("Job %s finished: %s", job_id, job_status) - - wait_for_log_upload(client, job_id) - tests, raw_logs, failure_details = fetch_logs_and_parse_tests(client, job_id) - - passed = job_status == JobState.COMPLETED.value.lower() - if tests: - passed = passed and all(tests.values()) - if not passed: - log.error("Job did not complete successfully or tests failed (status=%s)", job_status) - result = JobResult(passed=passed, tests=tests, raw_logs=raw_logs, failure_details=failure_details) if args.test == "backend-ops": title = f"Backend Ops — HTP0 ({args.device})" elif args.test == "all": @@ -394,7 +677,7 @@ def main() -> int: title = f"QDC Test Results ({args.device})" write_summary(result, title=title) - return 0 if passed else 1 + return 0 if result.passed else 1 if __name__ == "__main__": diff --git a/scripts/snapdragon/qdc/tests/linux/run_linux.sh b/scripts/snapdragon/qdc/tests/linux/run_linux.sh new file mode 100644 index 00000000000..a6abf8ec301 --- /dev/null +++ b/scripts/snapdragon/qdc/tests/linux/run_linux.sh @@ -0,0 +1,232 @@ +#!/bin/bash +# llama.cpp Hexagon test entry script for QDC Linux IoT (BASH framework). +# +# Placeholders substituted by run_qdc_jobs.py (--platform linux) before upload: +# {MODEL_URL} direct URL to a .gguf model file +# {TEST_MODE} bench | backend-ops | all +# +# QDC extracts the artifact zip to /data/local/tmp/TestContent/ and invokes +# this script via: /bin/bash /data/local/tmp/TestContent/run_linux.sh +# Any files written under /data/local/tmp/QDC_logs/ are auto-uploaded. 
+
+set +e
+umask 022
+
+LOG_DIR=/data/local/tmp/QDC_logs
+BUNDLE_DIR=/data/local/tmp/TestContent/llama_cpp_bundle
+MODEL_DIR=/data/local/tmp/gguf
+MODEL_PATH="$MODEL_DIR/model.gguf"
+RESULTS_XML="$LOG_DIR/results.xml"
+
+mkdir -p "$LOG_DIR" "$MODEL_DIR"
+# Redirect all parent-shell output to script.log so QDC auto-uploads it;
+# per-case runs still capture their own stdout/stderr into dedicated logs.
+exec > "$LOG_DIR/script.log" 2>&1
+
+echo "=== env ==="
+date -u
+uname -a
+pwd
+
+mount -o rw,remount / 2>/dev/null || true
+
+cd "$BUNDLE_DIR" || { echo "FATAL: bundle missing at $BUNDLE_DIR"; exit 1; }
+chmod +x bin/* 2>/dev/null
+export LD_LIBRARY_PATH="$BUNDLE_DIR/lib:$LD_LIBRARY_PATH"
+export ADSP_LIBRARY_PATH="$BUNDLE_DIR/lib"
+export GGML_HEXAGON_EXPERIMENTAL=1
+
+echo "=== download model ==="
+MODEL_URL="{MODEL_URL}"
+if [ -z "$MODEL_URL" ]; then
+    echo "No model URL provided, skipping download"
+elif [ ! -f "$MODEL_PATH" ]; then
+    curl -L -fS --retry 3 --retry-delay 5 -o "$MODEL_PATH" "$MODEL_URL"
+    curl_rc=$?
+    if [ $curl_rc -ne 0 ]; then
+        echo "FATAL: model download failed (rc=$curl_rc)"
+        exit 1
+    fi
+    ls -la "$MODEL_PATH"
+fi
+
+# ---------------------------------------------------------------------------
+# JUnit XML helpers
+# ---------------------------------------------------------------------------
+
+xml_open() {  # NOTE(review): XML tags reconstructed; stripped during extraction
+    printf '%s\n' \
+        '<?xml version="1.0" encoding="UTF-8"?>' \
+        "<testsuites>" \
+        "<testsuite name=\"llama.cpp-qdc-linux\">" \
+        > "$RESULTS_XML"
+}
+
+xml_close() {
+    printf '%s\n' '</testsuite>' '</testsuites>' >> "$RESULTS_XML"
+}
+
+xml_case_pass() {
+    local classname=$1 name=$2
+    printf '<testcase classname="%s" name="%s"/>\n' "$classname" "$name" >> "$RESULTS_XML"
+}
+
+xml_case_fail() {
+    local classname=$1 name=$2 rc=$3 logfile=$4
+    {
+        printf '<testcase classname="%s" name="%s">\n' "$classname" "$name"
+        printf '<failure message="exit code %s"><![CDATA[\n' "$rc"
+        tail -c 4000 "$logfile" 2>/dev/null | sed 's/]]>/]] >/g'
+        printf '\n]]></failure>\n</testcase>\n'
+    } >> "$RESULTS_XML"
+}
+
+# Map backend name -> "NDEV --device" pair. "none" means no offload (CPU).
+backend_env() { + case "$1" in + cpu) echo "0 none" ;; + gpu) echo "0 GPUOpenCL" ;; + npu) echo "1 HTP0" ;; + esac +} + +backend_log_name() { + case "$1" in + cpu) echo "cpu" ;; + gpu) echo "gpu" ;; + npu) echo "htp" ;; + esac +} + + +backend_device_name() { + case "$1" in + cpu) echo "none" ;; + gpu) echo "GPUOpenCL" ;; + npu) echo "HTP0" ;; + esac +} + +# Append a diagnostic block when a per-case `timeout N` fires (rc=124). The +# naked log file at that point usually just ends mid-OpenCL-init with no +# stderr, which is hard to read in CI summaries. +note_timeout_if_triggered() { + local rc=$1 budget=$2 log=$3 + [ "$rc" -eq 124 ] || return 0 + { + printf '\n' + printf '=== TIMEOUT after %ss ===\n' "$budget" + printf 'uptime: '; uptime 2>/dev/null + printf 'free -m:\n'; free -m 2>/dev/null + printf 'loadavg: '; cat /proc/loadavg 2>/dev/null + } >> "$log" +} + +completion_extra_args() { + case "$1" in + cpu) echo "--device none --ctx-size 128 -no-cnv -n 32 --seed 42 --batch-size 128" ;; + gpu) echo "--device GPUOpenCL --ctx-size 128 -no-cnv -n 32 --seed 42 --ubatch-size 512" ;; + npu) echo "--device HTP0 --ctx-size 128 -no-cnv -n 32 --seed 42 --ubatch-size 1024" ;; + esac +} + +run_completion_case() { + local name=$1 + local parts=($(backend_env "$name")) + local ndev=${parts[0]} device=${parts[1]} + local device_log_name=$(backend_device_name "$name") + local log="$LOG_DIR/llama_completion_${device_log_name}.log" + local prompt="$LOG_DIR/bench_prompt.txt" + echo 'What is the capital of France?' > "$prompt" + local extra + extra=$(completion_extra_args "$name") + echo "=== [completion:$name] llama-completion --device $device (NDEV=$ndev) ===" + timeout 600 env GGML_HEXAGON_NDEV=$ndev ./bin/llama-completion \ + -m "$MODEL_PATH" \ + -f "$prompt" \ + $extra \ + > "$log" 2>&1 < /dev/null + local rc=$? 
+ note_timeout_if_triggered "$rc" 600 "$log" + if [ $rc -eq 0 ]; then + xml_case_pass "tests.test_bench_posix" "test_llama_completion[$name]" + else + xml_case_fail "tests.test_bench_posix" "test_llama_completion[$name]" "$rc" "$log" + fi +} + +run_bench_case() { + local name=$1 + local parts=($(backend_env "$name")) + local ndev=${parts[0]} device=${parts[1]} + local log_suffix=$(backend_log_name "$name") + local log="$LOG_DIR/llama_bench_${log_suffix}.log" + echo "=== [bench:$name] llama-bench --device $device (NDEV=$ndev) ===" + timeout 600 env GGML_HEXAGON_NDEV=$ndev ./bin/llama-bench \ + -m "$MODEL_PATH" \ + --device "$device" \ + -ngl 99 \ + --batch-size 128 \ + -t 4 \ + -p 128 \ + -n 32 \ + > "$log" 2>&1 + local rc=$? + note_timeout_if_triggered "$rc" 600 "$log" + if [ $rc -eq 0 ]; then + xml_case_pass "tests.test_bench_posix" "test_llama_bench[$name]" + else + xml_case_fail "tests.test_bench_posix" "test_llama_bench[$name]" "$rc" "$log" + fi +} + +run_backend_ops_case() { + local dtype=$1 + local log="$LOG_DIR/backend_ops_${dtype}.log" + local pattern + case "$dtype" in + q4_0) + # Matches Android: exclude a known-broken shape on NPU. + pattern='^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$' + ;; + *) + pattern="type_a=${dtype}" + ;; + esac + echo "=== [backend-ops:$dtype] test-backend-ops -b HTP0 -o MUL_MAT ===" + timeout 600 env GGML_HEXAGON_NDEV=1 GGML_HEXAGON_HOSTBUF=0 ./bin/test-backend-ops \ + -b HTP0 -o MUL_MAT -p "$pattern" \ + > "$log" 2>&1 + local rc=$? 
+ note_timeout_if_triggered "$rc" 600 "$log" + if [ $rc -eq 0 ]; then + xml_case_pass "tests.test_backend_ops_posix" "test_backend_ops_htp0[$dtype]" + else + xml_case_fail "tests.test_backend_ops_posix" "test_backend_ops_htp0[$dtype]" "$rc" "$log" + fi +} + +xml_open + +case "{TEST_MODE}" in + bench) + for b in cpu gpu npu; do run_completion_case "$b"; done + for b in cpu gpu npu; do run_bench_case "$b"; done + ;; + backend-ops) + for d in mxfp4 fp16 q4_0; do run_backend_ops_case "$d"; done + ;; + all) + for b in cpu gpu npu; do run_completion_case "$b"; done + for b in cpu gpu npu; do run_bench_case "$b"; done + for d in mxfp4 fp16 q4_0; do run_backend_ops_case "$d"; done + ;; + *) + echo "FATAL: unsupported TEST_MODE={TEST_MODE}" + ;; +esac + +xml_close +echo "=== done ===" +# Host parses results.xml to decide pass/fail. +exit 0 diff --git a/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py b/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py index 958fc074762..355bf6c6a5b 100644 --- a/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py +++ b/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py @@ -1,8 +1,9 @@ """ On-device test-backend-ops runner for llama.cpp (HTP0 backend). -Executed by QDC's Appium test framework on the QDC runner. +On Android: executed by QDC's Appium test framework on the QDC runner. The runner has ADB access to the allocated device. +On Linux: runs test-backend-ops directly via run_linux.sh (BASH framework). 
""" import os @@ -10,7 +11,12 @@ import pytest -from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log +from utils import ( + BIN_PATH, + push_bundle_if_needed, + run_script, + write_qdc_log, +) @pytest.fixture(scope="session", autouse=True) @@ -20,17 +26,21 @@ def install(driver): @pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"]) def test_backend_ops_htp0(type_a): - cmd = f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 {BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT" if type_a == "q4_0": - cmd += r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"' + pattern = r'^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$' else: - cmd += f" -p type_a={type_a}" - result = run_adb_command( - cmd, - check=False, + pattern = f"type_a={type_a}" + + quoted_pattern = f'"{pattern}"' if type_a == "q4_0" else pattern + result = run_script( + "run-tool.sh", + extra_env={"HB": "0"}, + extra_args=["test-backend-ops", "-b", "HTP0", "-o", "MUL_MAT", "-p", quoted_pattern], ) write_qdc_log(f"backend_ops_{type_a}.log", result.stdout or "") - assert result.returncode == 0, f"test-backend-ops type_a={type_a} failed (exit {result.returncode})" + assert result.returncode == 0, ( + f"test-backend-ops type_a={type_a} failed (exit {result.returncode})" + ) if __name__ == "__main__": diff --git a/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py b/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py index 44802c3136a..f42227c9f6e 100644 --- a/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py +++ b/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py @@ -1,11 +1,13 @@ """ On-device bench and completion test runner for llama.cpp (CPU, GPU, NPU backends). -Executed by QDC's Appium test framework on the QDC runner. -The runner has ADB access to the allocated device. 
+On Android: calls upstream run-*.sh scripts from llama.cpp/scripts/snapdragon/adb/ +on the QDC runner host (scripts wrap commands in ``adb shell`` internally). + +On Linux: runs llama-bench directly via run_linux.sh (BASH framework). Placeholders replaced at artifact creation time by run_qdc_jobs.py: - <> Direct URL to the GGUF model file (downloaded on-device via curl) + <> Direct URL to the GGUF model file (downloaded on-device) """ import os @@ -14,58 +16,75 @@ import pytest -from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log +from utils import ( + BIN_PATH, + MODEL_DEVICE_PATH, + MODEL_NAME, + PROMPT_DIR, + push_bundle_if_needed, + run_adb_command, + run_script, + write_qdc_log, +) -MODEL_PATH = "/data/local/tmp/model.gguf" -PROMPT = "What is the capital of France?" -CLI_OPTS = "--batch-size 128 -n 128 -no-cnv --seed 42" +MODEL_URL = "<>" @pytest.fixture(scope="session", autouse=True) def install(driver): push_bundle_if_needed(f"{BIN_PATH}/llama-cli") - - # Skip model download if already present + run_adb_command(f"mkdir -p /data/local/tmp/gguf {PROMPT_DIR}") + run_adb_command(f"echo 'What is the capital of France?' 
> {PROMPT_DIR}/bench_prompt.txt") check = subprocess.run( - ["adb", "shell", f"ls {MODEL_PATH}"], + ["adb", "shell", f"ls {MODEL_DEVICE_PATH}"], text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) if check.returncode != 0: - run_adb_command(f'curl -L -J --output {MODEL_PATH} "<>"') - - -@pytest.mark.parametrize("device,extra_flags", [ - pytest.param("none", "-ctk q8_0 -ctv q8_0", id="cpu"), - pytest.param("GPUOpenCL", "", id="gpu"), - pytest.param("HTP0", "-ctk q8_0 -ctv q8_0", id="npu"), -]) -def test_llama_completion(device, extra_flags): - result = run_adb_command( - f'{CMD_PREFIX} {BIN_PATH}/llama-completion' - f' -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on' - f' -p "{PROMPT}"', - check=False, + run_adb_command(f'curl -L -J --output {MODEL_DEVICE_PATH} "{MODEL_URL}"') + + +@pytest.mark.parametrize( + "device", + [ + pytest.param("none", id="cpu"), + pytest.param("GPUOpenCL", id="gpu"), + pytest.param("HTP0", id="npu"), + ], +) +def test_llama_completion(device): + result = run_script( + "run-completion.sh", + extra_env={"D": device, "M": MODEL_NAME}, + extra_args=["--batch-size", "128", "-n", "128", "--seed", "42", + "-f", f"{PROMPT_DIR}/bench_prompt.txt"], ) write_qdc_log(f"llama_completion_{device}.log", result.stdout or "") - assert result.returncode == 0, f"llama-completion {device} failed (exit {result.returncode})" + assert result.returncode == 0, ( + f"llama-completion {device} failed (exit {result.returncode})" + ) _DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"} -@pytest.mark.parametrize("device", [ - pytest.param("none", id="cpu"), - pytest.param("GPUOpenCL", id="gpu"), - pytest.param("HTP0", id="npu"), -]) +@pytest.mark.parametrize( + "device", + [ + pytest.param("none", id="cpu"), + pytest.param("GPUOpenCL", id="gpu"), + pytest.param("HTP0", id="npu"), + ], +) def test_llama_bench(device): - result = run_adb_command( - f"{CMD_PREFIX} {BIN_PATH}/llama-bench" - f" -m {MODEL_PATH} 
--device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32", - check=False, + result = run_script( + "run-bench.sh", + extra_env={"D": device, "M": MODEL_NAME}, + extra_args=["--batch-size", "128", "-p", "128", "-n", "32"], ) write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", result.stdout or "") - assert result.returncode == 0, f"llama-bench {device} failed (exit {result.returncode})" + assert result.returncode == 0, ( + f"llama-bench {device} failed (exit {result.returncode})" + ) if __name__ == "__main__": diff --git a/scripts/snapdragon/qdc/tests/utils.py b/scripts/snapdragon/qdc/tests/utils.py index 00f0f1b2f91..fad6a923295 100644 --- a/scripts/snapdragon/qdc/tests/utils.py +++ b/scripts/snapdragon/qdc/tests/utils.py @@ -1,5 +1,7 @@ """Shared helpers for QDC on-device test runners.""" +from __future__ import annotations + import logging import os import subprocess @@ -13,16 +15,14 @@ # On-device paths # --------------------------------------------------------------------------- -BUNDLE_PATH = "/data/local/tmp/llama_cpp_bundle" +BUNDLE_PATH = "/data/local/tmp/llama.cpp" +BIN_PATH = f"{BUNDLE_PATH}/bin" +LIB_PATH = f"{BUNDLE_PATH}/lib" QDC_LOGS_PATH = "/data/local/tmp/QDC_logs" -LIB_PATH = f"{BUNDLE_PATH}/lib" -BIN_PATH = f"{BUNDLE_PATH}/bin" -ENV_PREFIX = ( - f"export LD_LIBRARY_PATH={LIB_PATH} && " - f"export ADSP_LIBRARY_PATH={LIB_PATH} && " - f"chmod +x {BIN_PATH}/* &&" -) -CMD_PREFIX = f"cd {BUNDLE_PATH} && {ENV_PREFIX}" +SCRIPTS_DIR = "/qdc/appium" +MODEL_NAME = "model.gguf" +MODEL_DEVICE_PATH = "/data/local/tmp/gguf/model.gguf" +PROMPT_DIR = "/data/local/tmp/scorecard_prompts" # --------------------------------------------------------------------------- # Appium session options @@ -34,16 +34,47 @@ options.set_capability("deviceName", os.getenv("ANDROID_DEVICE_VERSION")) # --------------------------------------------------------------------------- -# ADB helpers +# Shell / process helpers +# 
---------------------------------------------------------------------------
+
+
+def write_qdc_log(filename: str, content: str) -> None:
+    """Write content as a log file for QDC log collection."""
+    subprocess.run(
+        ["adb", "shell", f"mkdir -p {QDC_LOGS_PATH}"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    )
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
+        f.write(content)
+        tmp_path = f.name
+    try:
+        subprocess.run(
+            ["adb", "push", tmp_path, f"{QDC_LOGS_PATH}/{filename}"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+        )
+    finally:
+        os.unlink(tmp_path)
+
+
+def ensure_bundle(check_binary: str | None = None) -> None:
+    """Ensure the llama_cpp_bundle is available on the target device."""
+    push_bundle_if_needed(check_binary or f"{BIN_PATH}/llama-cli")
+
+
+# ---------------------------------------------------------------------------
+# Android / Linux host helpers
 # ---------------------------------------------------------------------------
 
 
 def run_adb_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess:
-    # Append exit-code sentinel because `adb shell` doesn't reliably propagate
-    # the on-device exit code (older ADB versions always return 0).
+    """Run a command on-device via ``adb shell`` with exit-code sentinel."""
     raw = subprocess.run(
         ["adb", "shell", f"{cmd}; echo __RC__:$?"],
-        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        text=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
     )
     stdout = raw.stdout
     returncode = raw.returncode
@@ -55,39 +86,58 @@ def run_adb_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProc
         stdout = "\n".join(lines[:-1]) + "\n"
     except ValueError:
         pass
-    log.info("%s", stdout)
+    log.info(stdout)
     result = subprocess.CompletedProcess(raw.args, returncode, stdout=stdout)
     if check:
         assert returncode == 0, f"Command failed (exit {returncode})"
     return result
 
 
-def write_qdc_log(filename: str, content: str) -> None:
-    """Push content as a log file to QDC_LOGS_PATH on the device for QDC log collection."""
+def run_script(
+    script: str,
+    extra_env: dict[str, str] | None = None,
+    extra_args: list[str] | None = None,
+) -> subprocess.CompletedProcess:
+    """Run an upstream shell script from /qdc/appium/ on the QDC runner host."""
+    env = os.environ.copy()
+    env["GGML_HEXAGON_EXPERIMENTAL"] = "1"
+    if extra_env:
+        env.update(extra_env)
+    cmd = [f"{SCRIPTS_DIR}/{script}"] + (extra_args or [])
+    result = subprocess.run(
+        cmd, env=env,
+        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+    )
+    log.info(result.stdout)
+    return result
+
+
+def adb_shell(cmd: str) -> None:
+    """Run a command via adb shell (fire-and-forget, no error check)."""
     subprocess.run(
-        ["adb", "shell", f"mkdir -p {QDC_LOGS_PATH}"],
-        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        ["adb", "shell", "sh", "-c", cmd],
+        capture_output=True, encoding="utf-8", errors="replace", check=False,
     )
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
-        f.write(content)
-        tmp_path = f.name
-    try:
-        subprocess.run(
-            ["adb", "push", tmp_path, f"{QDC_LOGS_PATH}/{filename}"],
-            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-        )
-    finally:
-        
os.unlink(tmp_path) def push_bundle_if_needed(check_binary: str) -> None: """Push llama_cpp_bundle to the device if check_binary is not already present.""" result = subprocess.run( ["adb", "shell", f"ls {check_binary}"], - text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, ) if result.returncode != 0: subprocess.run( - ["adb", "push", "/qdc/appium/llama_cpp_bundle/", "/data/local/tmp"], - text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + ["adb", "push", "/qdc/appium/llama_cpp_bundle/", BUNDLE_PATH], + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + subprocess.run( + ["adb", "shell", f"find {BUNDLE_PATH}/bin -type f -exec chmod 755 {{}} +"], + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, ) diff --git a/scripts/webui-download.cmake b/scripts/ui-download.cmake similarity index 74% rename from scripts/webui-download.cmake rename to scripts/ui-download.cmake index 344ba5ecf53..ce1f901e4d1 100644 --- a/scripts/webui-download.cmake +++ b/scripts/ui-download.cmake @@ -1,5 +1,5 @@ -# Download webui assets from Hugging Face Bucket at build time -# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/webui-download.cmake +# Download UI assets from Hugging Face Bucket at build time +# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/ui-download.cmake # # Asset provisioning priority: # 1. 
Pre-built assets already in PUBLIC_DIR (cached from a previous run) @@ -14,7 +14,7 @@ set(HF_VERSION "" CACHE STRING "Version to download (empty = resolve from git) set(ASSETS "" CACHE STRING "Plus-separated list of asset filenames (+)") set(STAMP_FILE "" CACHE STRING "Stamp file to create on success (optional)") set(SOURCE_DIR "" CACHE STRING "Project source root (to resolve version from git)") -set(NPM_DIR "" CACHE STRING "WebUI source directory (to run npm build)") +set(NPM_DIR "" CACHE STRING "UI source directory (to run npm build)") set(HF_ENABLED "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)") # --------------------------------------------------------------------------- @@ -25,8 +25,8 @@ if("${RESOLVED_VERSION}" STREQUAL "" AND NOT "${SOURCE_DIR}" STREQUAL "") if(EXISTS "${SOURCE_DIR}/cmake/build-info.cmake") include("${SOURCE_DIR}/cmake/build-info.cmake") if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0) - set(RESOLVED_VERSION "${BUILD_NUMBER}") - message(STATUS "WebUI: resolved version from git: ${RESOLVED_VERSION}") + set(RESOLVED_VERSION "b${BUILD_NUMBER}") + message(STATUS "UI: resolved version from git: ${RESOLVED_VERSION}") endif() endif() endif() @@ -43,7 +43,7 @@ if(NOT "${STAMP_FILE}" STREQUAL "" AND EXISTS "${STAMP_FILE}") file(READ "${STAMP_FILE}" STAMPED_VERSION) string(STRIP "${STAMPED_VERSION}" STAMPED_VERSION) if(NOT "${STAMPED_VERSION}" STREQUAL "${RESOLVED_VERSION}") - message(STATUS "WebUI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building") + message(STATUS "UI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building") set(FORCE_REBUILD TRUE) endif() endif() @@ -60,7 +60,7 @@ foreach(asset ${ASSETS}) endforeach() if(ALL_EXISTS AND NOT FORCE_REBUILD) - message(STATUS "WebUI: all assets already exist in ${PUBLIC_DIR}, skipping") + message(STATUS "UI: all assets already exist in ${PUBLIC_DIR}, skipping") return() endif() @@ -76,27 +76,27 @@ if(NOT PROVISION_SUCCESS AND 
NOT "${NPM_DIR}" STREQUAL "") # Check if npm is available before attempting npm build find_program(NPM_EXECUTABLE npm) if(NPM_EXECUTABLE) - message(STATUS "WebUI: building from source in ${NPM_DIR}") + message(STATUS "UI: building from source in ${NPM_DIR}") # Run npm install if node_modules is missing if(NOT EXISTS "${NPM_DIR}/node_modules") - message(STATUS "WebUI: running npm install (first time)") + message(STATUS "UI: running npm install (first time)") execute_process( - COMMAND npm install + COMMAND ${NPM_EXECUTABLE} install WORKING_DIRECTORY "${NPM_DIR}" RESULT_VARIABLE NPM_INSTALL_RESULT OUTPUT_VARIABLE NPM_OUT ERROR_VARIABLE NPM_ERR ) if(NOT NPM_INSTALL_RESULT EQUAL 0) - message(STATUS "WebUI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download") + message(STATUS "UI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download") message(STATUS " stderr: ${NPM_ERR}") endif() endif() # Run the build execute_process( - COMMAND npm run build + COMMAND ${NPM_EXECUTABLE} run build WORKING_DIRECTORY "${NPM_DIR}" RESULT_VARIABLE NPM_BUILD_RESULT OUTPUT_VARIABLE NPM_OUT @@ -114,20 +114,20 @@ if(NOT PROVISION_SUCCESS AND NOT "${NPM_DIR}" STREQUAL "") endforeach() if(ALL_BUILT) - message(STATUS "WebUI: local npm build succeeded") + message(STATUS "UI: local npm build succeeded") set(PROVISION_SUCCESS TRUE) else() - message(STATUS "WebUI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download") + message(STATUS "UI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download") endif() else() - message(STATUS "WebUI: npm build failed (${NPM_BUILD_RESULT}), falling back to download") + message(STATUS "UI: npm build failed (${NPM_BUILD_RESULT}), falling back to download") message(STATUS " stderr: ${NPM_ERR}") endif() else() - message(STATUS "WebUI: npm not found, skipping npm build and trying HF Bucket download") + message(STATUS "UI: npm not found, skipping npm build and trying HF Bucket 
download") endif() else() - message(STATUS "WebUI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build") + message(STATUS "UI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build") endif() endif() @@ -148,7 +148,7 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) string(REGEX REPLACE "^([^:]+):.*$" "\\1" url_label "${entry}") string(REGEX REPLACE "^[^:]+:(.*)$" "\\1" base_url "${entry}") - message(STATUS "WebUI: downloading assets from ${url_label}: ${base_url}") + message(STATUS "UI: downloading assets from ${url_label}: ${base_url}") # Download each asset set(ALL_OK TRUE) @@ -161,11 +161,11 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) list(GET download_status 0 download_result) if(NOT download_result EQUAL 0) list(GET download_status 1 error_message) - message(STATUS "WebUI: failed to download ${asset} from ${url_label}: ${error_message}") + message(STATUS "UI: failed to download ${asset} from ${url_label}: ${error_message}") set(ALL_OK FALSE) break() endif() - message(STATUS "WebUI: downloaded ${asset}") + message(STATUS "UI: downloaded ${asset}") endforeach() if(NOT ALL_OK) @@ -179,20 +179,24 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) ) list(GET checksum_status 0 checksum_result) if(checksum_result EQUAL 0) - message(STATUS "WebUI: verifying checksums...") + message(STATUS "UI: verifying checksums...") file(STRINGS "${PUBLIC_DIR}/checksums.txt" CHECKSUMS_CONTENT) foreach(asset ${ASSETS}) set(download_path "${PUBLIC_DIR}/${asset}") file(SHA256 "${download_path}" asset_hash) string(TOUPPER "${asset_hash}" EXPECTED_HASH_UPPER) - string(REGEX MATCH "${EXPECTED_HASH_UPPER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}") + # Escape regex-special characters in asset name to avoid false positives + # (e.g., '.' 
in 'bundle.js' matching any character) + string(REGEX REPLACE "([][.+*?^$(){|}\\])" "\\1" ASSET_ESCAPED "${asset}") + string(REGEX MATCH "${EXPECTED_HASH_UPPER}[ \\t]+${ASSET_ESCAPED}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}") if(NOT CHECKSUM_LINE) - message(WARNING "WebUI: checksum verification failed for ${asset}") - message(WARNING " downloaded file may not match expected checksum, but will be used") + message(WARNING "UI: checksum verification failed for ${asset}") + set(ALL_OK FALSE) + break() endif() endforeach() if(ALL_OK) - message(STATUS "WebUI: all checksums verified") + message(STATUS "UI: all checksums verified") endif() endif() @@ -203,9 +207,9 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) endforeach() if(PROVISION_SUCCESS) - message(STATUS "WebUI: provisioning complete") + message(STATUS "UI: provisioning complete") else() - message(WARNING "WebUI: failed to download assets from HF Bucket (${HF_BUCKET})") + message(WARNING "UI: failed to download assets from HF Bucket (${HF_BUCKET})") endif() endif() @@ -217,6 +221,6 @@ if(PROVISION_SUCCESS) file(WRITE "${STAMP_FILE}" "${RESOLVED_VERSION}") endif() else() - message(WARNING "WebUI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.") - message(WARNING "WebUI: building server without embedded WebUI. Set LLAMA_BUILD_WEBUI=OFF to suppress this warning.") + message(WARNING "UI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.") + message(WARNING "UI: building server without embedded UI. 
Set LLAMA_BUILD_UI=OFF to suppress this warning.") endif() diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake index 14d2753808a..00f32bbf4d6 100644 --- a/scripts/xxd.cmake +++ b/scripts/xxd.cmake @@ -1,5 +1,5 @@ # CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}` -# Usage: cmake -DINPUT=tools/server/public/index.html -DOUTPUT=tools/server/index.html.hpp -P scripts/xxd.cmake +# Usage: cmake -DINPUT=build/tools/ui/index.html -DOUTPUT=build/tools/ui/index.html.hpp -P scripts/xxd.cmake SET(INPUT "" CACHE STRING "Input File") SET(OUTPUT "" CACHE STRING "Output File") diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index ea9d87ebed2..05c60d29743 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1664,6 +1664,83 @@ static void test_convert_responses_to_chatcmpl() { assert_equals(false, result.contains("max_output_tokens")); assert_equals(100, result.at("max_tokens").get()); } + + // Test mixed Responses tools: convert only function tools + { + json input = json::parse(R"({ + "input": "Hello", + "model": "test-model", + "tools": [ + { + "type": "web_search" + }, + { + "type": "function", + "name": "get_weather", + "description": "Get weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + "required": ["location"] + } + }, + { + "type": "image_generation" + }, + { + "type": "mcp", + "server_label": "test-server" + }, + { + "type": "namespace", + "name": "browser" + } + ] + })"); + + json result = server_chat_convert_responses_to_chatcmpl(input); + + assert_equals(true, result.contains("tools")); + assert_equals(true, result.at("tools").is_array()); + assert_equals((size_t)1, result.at("tools").size()); + + const auto & tool = result.at("tools")[0]; + assert_equals(std::string("function"), tool.at("type").get()); + assert_equals(std::string("get_weather"), tool.at("function").at("name").get()); + assert_equals(true, tool.at("function").at("strict").get()); + } + + // Test non-function 
Responses tools are ignored + { + json input = json::parse(R"({ + "input": "Hello", + "model": "test-model", + "tools": [ + { + "type": "web_search" + }, + { + "type": "image_generation" + }, + { + "type": "mcp", + "server_label": "test-server" + }, + { + "type": "namespace", + "name": "browser" + } + ] + })"); + + json result = server_chat_convert_responses_to_chatcmpl(input); + + assert_equals(false, result.contains("tools")); + } } static void test_template_output_peg_parsers(bool detailed_debug) { diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp index f7a60178996..10d0ac21da8 100644 --- a/tests/test-reasoning-budget.cpp +++ b/tests/test-reasoning-budget.cpp @@ -124,6 +124,66 @@ static void test_reasoning_budget( (void)sequence; } +static llama_token get_forced_token(struct llama_sampler * sampler, llama_token max_token) { + std::vector cur; + const size_t n_vocab = (size_t) max_token + 1; + for (size_t i = 0; i < n_vocab; i++) { + cur.emplace_back(llama_token_data{(llama_token) i, logf((float) (i + 1)), 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_sampler_apply(sampler, &cur_p); + + size_t finite_count = 0; + llama_token finite_token = LLAMA_TOKEN_NULL; + for (size_t i = 0; i < cur.size(); i++) { + if (std::isfinite(cur[i].logit)) { + finite_count++; + finite_token = cur[i].id; + } + } + + GGML_ASSERT(finite_count == 1 && "sampler is not forcing exactly one token"); + return finite_token; +} + +static void test_reasoning_budget_clone_mid_counting() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 2, REASONING_BUDGET_IDLE); + + llama_sampler_accept(sampler, 100); // COUNTING, remaining=2 + llama_sampler_accept(sampler, 50); // COUNTING, remaining=1 + + auto * clone = llama_sampler_clone(sampler); + llama_sampler_accept(clone, 51); // should 
exhaust the cloned remaining budget + + GGML_ASSERT(get_forced_token(clone, 102) == 102 && "cloned counting state lost remaining budget"); + + llama_sampler_free(clone); + llama_sampler_free(sampler); +} + +static void test_reasoning_budget_clone_mid_forcing() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 0, REASONING_BUDGET_FORCING); + + GGML_ASSERT(get_forced_token(sampler, 102) == 102); + llama_sampler_accept(sampler, 102); // advance to the second forced token + + auto * clone = llama_sampler_clone(sampler); + + GGML_ASSERT(get_forced_token(clone, 102) == 101 && "cloned forcing state lost force position"); + + llama_sampler_free(clone); + llama_sampler_free(sampler); +} + // UTF-8 boundary detection unit test // Tests common_utf8_is_complete() from reasoning-budget.h static void test_utf8_boundary_detection() { @@ -250,7 +310,10 @@ int main(void) { 7); // forcing continues through i=7 } - printf("OK (6 tests passed)\n"); + test_reasoning_budget_clone_mid_counting(); + test_reasoning_budget_clone_mid_forcing(); + + printf("OK (8 tests passed)\n"); printf("Testing UTF-8 boundary detection... 
"); test_utf8_boundary_detection(); diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 20e4eb37697..ca55ad615b5 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -40,11 +40,28 @@ set(TARGET_SRCS server-models.h ) -# Option to specify custom HF bucket for webui (defaults to llama-ui) -# Usage: cmake -B build -DLLAMA_WEBUI_HF_BUCKET=llama-ui -set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets") +# -------------------------------------------------------------------------- +# Web UI / UI build configuration +# -------------------------------------------------------------------------- + +# Deprecated: use LLAMA_UI_HF_BUCKET instead +# Usage: cmake -B build -DLLAMA_UI_HF_BUCKET=llama-ui +set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets (deprecated: use LLAMA_UI_HF_BUCKET)") +set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets") + +# Backward compat: forward old var to new one. Both are cache variables defined +# above, so compare values against the default instead of DEFINED (always true here) +if(NOT "${LLAMA_WEBUI_HF_BUCKET}" STREQUAL "llama-ui" AND "${LLAMA_UI_HF_BUCKET}" STREQUAL "llama-ui") + message(DEPRECATION "LLAMA_WEBUI_HF_BUCKET is deprecated, use LLAMA_UI_HF_BUCKET instead") + set(LLAMA_UI_HF_BUCKET ${LLAMA_WEBUI_HF_BUCKET}) +endif() + +# Support both old (LLAMA_BUILD_WEBUI) and new (LLAMA_BUILD_UI) option names +if(LLAMA_BUILD_WEBUI OR LLAMA_BUILD_UI) + if(LLAMA_BUILD_WEBUI AND NOT LLAMA_BUILD_UI) + message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead") + endif() -if (LLAMA_BUILD_WEBUI) set(PUBLIC_ASSETS index.html bundle.js @@ -52,94 +69,102 @@ if (LLAMA_BUILD_WEBUI) loading.html ) - # Determine source of webui assets (priority: local > HF Bucket) - set(WEBUI_SOURCE "") - set(WEBUI_SOURCE_DIR "") + # Determine source of UI assets (priority: local > HF Bucket) + set(UI_SOURCE 
"") + set(UI_SOURCE_DIR "") - # Priority 1: Check for local webui build output - set(LOCAL_WEBUI_DIR "${CMAKE_CURRENT_SOURCE_DIR}/public") + # Priority 1: Check for local build output + set(LOCAL_UI_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui") # Verify all required assets exist before declaring local source valid set(ALL_ASSETS_PRESENT TRUE) foreach(asset ${PUBLIC_ASSETS}) - if(NOT EXISTS "${LOCAL_WEBUI_DIR}/${asset}") + if(NOT EXISTS "${LOCAL_UI_DIR}/${asset}") set(ALL_ASSETS_PRESENT FALSE) break() endif() endforeach() if(ALL_ASSETS_PRESENT) - set(WEBUI_SOURCE "local") - set(WEBUI_SOURCE_DIR "${LOCAL_WEBUI_DIR}") - message(STATUS "WebUI: using local build from ${WEBUI_SOURCE_DIR}") + set(UI_SOURCE "local") + set(UI_SOURCE_DIR "${LOCAL_UI_DIR}") + message(STATUS "UI: using local build from ${UI_SOURCE_DIR}") endif() # Priority 2: Build-time asset provisioning (npm build → HF Bucket fallback) - if(NOT WEBUI_SOURCE_DIR) + if(NOT UI_SOURCE_DIR) # Environment variable takes precedence (e.g., from CI workflows) + # Deprecated: use HF_UI_VERSION instead if(DEFINED ENV{HF_WEBUI_VERSION}) - set(HF_WEBUI_VERSION "$ENV{HF_WEBUI_VERSION}") + set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}") + message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead") + # Validate against allowed characters to prevent CMake list separator + # or path-traversal issues in stamp filenames and download URLs + if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") + message(FATAL_ERROR "UI: invalid HF_WEBUI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") + endif() + elseif(DEFINED ENV{HF_UI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_UI_VERSION}") # Validate against allowed characters to prevent CMake list separator # or path-traversal issues in stamp filenames and download URLs - if(NOT HF_WEBUI_VERSION MATCHES "^[A-Za-z0-9._-]+$") - message(FATAL_ERROR "WebUI: invalid HF_WEBUI_VERSION='${HF_WEBUI_VERSION}' - must match ^[A-Za-z0-9._-]+$") + if(NOT HF_UI_VERSION MATCHES 
"^[A-Za-z0-9._-]+$") + message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") endif() - message(STATUS "WebUI: using HF_WEBUI_VERSION from environment=${HF_WEBUI_VERSION}") elseif(DEFINED LLAMA_BUILD_NUMBER) - set(HF_WEBUI_VERSION "b${LLAMA_BUILD_NUMBER}") - message(STATUS "WebUI: using LLAMA_BUILD_NUMBER=${HF_WEBUI_VERSION}") + set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}") + message(STATUS "UI: derived HF_UI_VERSION=b${LLAMA_BUILD_NUMBER}") else() - set(HF_WEBUI_VERSION "") - message(STATUS "WebUI: version not specified (will use HF 'latest')") + set(HF_UI_VERSION "") + message(STATUS "UI: version not specified (will use HF 'latest')") endif() # Stamp file embeds the version tag so a changed build number triggers # a fresh provision run on the next `cmake --build` without reconfiguring. - if("${HF_WEBUI_VERSION}" STREQUAL "") - set(WEBUI_VERSION_TAG "provisioned") + if("${HF_UI_VERSION}" STREQUAL "") + set(UI_VERSION_TAG "provisioned") else() - set(WEBUI_VERSION_TAG "${HF_WEBUI_VERSION}") + set(UI_VERSION_TAG "${HF_UI_VERSION}") endif() - set(WEBUI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.webui-${WEBUI_VERSION_TAG}.stamp") + set(UI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.ui-${UI_VERSION_TAG}.stamp") # Join assets with + separator (safe across all platforms, unlike ; and |) string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}") add_custom_command( - OUTPUT ${WEBUI_STAMP} + OUTPUT ${UI_STAMP} COMMAND ${CMAKE_COMMAND} "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}" - "-DPUBLIC_DIR=${CMAKE_CURRENT_SOURCE_DIR}/public" - "-DHF_BUCKET=${LLAMA_WEBUI_HF_BUCKET}" - "-DHF_VERSION=${HF_WEBUI_VERSION}" - "-DHF_ENABLED=${LLAMA_USE_PREBUILT_WEBUI}" + "-DPUBLIC_DIR=${PROJECT_SOURCE_DIR}/build/tools/ui" + "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}" + "-DHF_VERSION=${HF_UI_VERSION}" + "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}" "-DASSETS=${PUBLIC_ASSETS_JOINED}" - "-DSTAMP_FILE=${WEBUI_STAMP}" - "-DNPM_DIR=${CMAKE_CURRENT_SOURCE_DIR}/webui" - -P 
${PROJECT_SOURCE_DIR}/scripts/webui-download.cmake - COMMENT "Building/provisioning WebUI assets (npm build -> HF Bucket fallback)" + "-DSTAMP_FILE=${UI_STAMP}" + "-DNPM_DIR=${PROJECT_SOURCE_DIR}/tools/ui" + -P ${PROJECT_SOURCE_DIR}/scripts/ui-download.cmake + COMMENT "Building/provisioning UI assets (npm build -> HF Bucket fallback)" ) - set(WEBUI_SOURCE "provisioned") - set(WEBUI_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/public") + set(UI_SOURCE "provisioned") + set(UI_SOURCE_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui") endif() # Process assets from the determined source - if(WEBUI_SOURCE_DIR) + if(UI_SOURCE_DIR) foreach(asset ${PUBLIC_ASSETS}) - set(input "${WEBUI_SOURCE_DIR}/${asset}") + set(input "${UI_SOURCE_DIR}/${asset}") set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") list(APPEND TARGET_SRCS ${output}) - if(WEBUI_SOURCE STREQUAL "local") + if(UI_SOURCE STREQUAL "local") # Local build: files exist at configure time if(NOT EXISTS "${input}") - message(FATAL_ERROR "WebUI asset not found: ${input}") + message(FATAL_ERROR "UI asset not found: ${input}") endif() set(dependency "${input}") else() # HF Bucket: files are downloaded at build time - set(dependency "${WEBUI_STAMP}") + set(dependency "${UI_STAMP}") endif() add_custom_command( @@ -150,19 +175,24 @@ if (LLAMA_BUILD_WEBUI) set_source_files_properties(${output} PROPERTIES GENERATED TRUE) endforeach() - add_definitions(-DLLAMA_BUILD_WEBUI) - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=1) - message(STATUS "WebUI: embedded with source: ${WEBUI_SOURCE}") + # Define both old and new preprocessor symbols for backward compat + add_definitions(-DLLAMA_BUILD_WEBUI) # Deprecated: use LLAMA_BUILD_UI + add_definitions(-DLLAMA_BUILD_UI) + add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=1) # Deprecated: use LLAMA_UI_DEFAULT_ENABLED + add_definitions(-DLLAMA_UI_DEFAULT_ENABLED=1) + message(STATUS "UI: embedded with source: ${UI_SOURCE}") else() - # WebUI source not found - issue warning but don't fail the build - # 
The server will still build but without webui embedded - message(WARNING "WebUI: no source available. Neither local build (tools/server/public/) nor HF Bucket download succeeded.") - message(WARNING "WebUI: building server without embedded WebUI. Set LLAMA_BUILD_WEBUI=OFF to suppress this warning.") - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) + # UI source not found - issue warning but don't fail the build + # The server will still build but without UI embedded + message(WARNING "UI: no source available. Neither local build (build/tools/ui/) nor HF Bucket download succeeded.") + message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.") + add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) # Deprecated + add_definitions(-DLLAMA_UI_DEFAULT_ENABLED=0) endif() else() - # WebUI is disabled at build time - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) + # UI is disabled at build time + add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) # Deprecated + add_definitions(-DLLAMA_UI_DEFAULT_ENABLED=0) endif() add_executable(${TARGET} ${TARGET_SRCS}) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index a9c1e7385fc..0ff334724a3 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -224,7 +224,7 @@ The SvelteKit-based Web UI is introduced in this PR: https://github.com/ggml-org ### Architecture -The WebUI follows a layered architecture: +The UI follows a layered architecture: ``` Routes → Components → Hooks → Stores → Services → Storage/API @@ -234,7 +234,7 @@ Routes → Components → Hooks → Stores → Services → Storage/API - **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`) - **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`) -For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/): +For detailed architecture diagrams, see [`tools/ui/docs/`](../ui/docs/): - 
`high-level-architecture.mmd` - full architecture with all modules - `high-level-architecture-simplified.mmd` - simplified overview @@ -246,7 +246,7 @@ For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/ ```sh # make sure you have Node.js installed -cd tools/server/webui +cd tools/ui npm i # run dev server (with hot reload) diff --git a/tools/server/README.md b/tools/server/README.md index 5ba1a58f8c1..6dfed724e4c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -189,11 +189,11 @@ For the full list of features, please refer to [server's changelog](https://gith | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | -| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | -| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | +| `--ui-config JSON` / `--webui-config JSON` (deprecated) | JSON that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG / LLAMA_ARG_WEBUI_CONFIG) | +| `--ui-config-file PATH` / `--webui-config-file PATH` (deprecated) | JSON file that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG_FILE / LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--ui-mcp-proxy, --no-ui-mcp-proxy` / `--webui-mcp-proxy, --no-webui-mcp-proxy` (deprecated) | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_UI_MCP_PROXY / LLAMA_ARG_WEBUI_MCP_PROXY) | | `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)
specify "all" to enable all tools
available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime
(env: LLAMA_ARG_TOOLS) | -| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | +| `--ui, --no-ui` / `--webui, --no-webui` (deprecated) | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_UI / LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)
(env: LLAMA_API_KEY) | @@ -1815,10 +1815,12 @@ Apart from error types supported by OAI, we also have custom types that are spec ### Custom default Web UI preferences -You can specify default preferences for the web UI using `--webui-config ` or `--webui-config-file `. For example, you can disable pasting long text as attachments and enable rendering Markdown in user messages with this command: +You can specify default preferences for the web UI using `--ui-config ` or `--ui-config-file `. For example, you can disable pasting long text as attachments and enable rendering Markdown in user messages with this command: ```bash -./llama-server -m model.gguf --webui-config '{"pasteLongTextToFileLen": 0, "renderUserContentAsMarkdown": true}' +./llama-server -m model.gguf --ui-config '{"pasteLongTextToFileLen": 0, "renderUserContentAsMarkdown": true}' ``` -You may find available preferences in [settings-config.ts](webui/src/lib/constants/settings-config.ts). +> **Note:** The old flags `--webui-config` and `--webui-config-file` are deprecated but still work as aliases. + +You may find available preferences in [settings-config.ts](../ui/src/lib/constants/settings-config.ts). diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index f276f8da58f..02858a2a028 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -257,8 +257,11 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { for (json resp_tool : response_body.at("tools")) { json chatcmpl_tool; - if (json_value(resp_tool, "type", std::string()) != "function") { - throw std::invalid_argument("'type' of tool must be 'function'"); + const std::string type = json_value(resp_tool, "type", std::string()); + if (type != "function") { + // Non-function Responses tools have no Chat Completions equivalent. 
+ SRV_WRN("unsupported Responses tool type '%s' skipped\n", type.c_str()); + continue; } resp_tool.erase("type"); chatcmpl_tool["type"] = "function"; @@ -270,7 +273,9 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { chatcmpl_tools.push_back(chatcmpl_tool); } chatcmpl_body.erase("tools"); - chatcmpl_body["tools"] = chatcmpl_tools; + if (!chatcmpl_tools.empty()) { + chatcmpl_body["tools"] = chatcmpl_tools; + } } if (response_body.contains("max_output_tokens")) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index d49c986feef..1dc19536866 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -671,7 +671,8 @@ struct server_context_impl { server_metrics metrics; - json json_webui_settings = json::object(); + json json_ui_settings = json::object(); // Primary: new name + json json_webui_settings = json::object(); // Deprecated: use json_ui_settings instead (kept for compat) // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -996,13 +997,18 @@ struct server_context_impl { } } - // populate webui settings + // populate UI settings (from either new ui_config_json or deprecated webui_config_json) { - if (!params_base.webui_config_json.empty()) { + const std::string & cfg = !params_base.ui_config_json.empty() + ? 
params_base.ui_config_json + : params_base.webui_config_json; + if (!cfg.empty()) { try { - json_webui_settings = json::parse(params_base.webui_config_json); + json json_settings = json::parse(cfg); + json_ui_settings = json_settings; + json_webui_settings = json_settings; // deprecated: keep in sync } catch (const std::exception & e) { - SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + SRV_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); return false; } } @@ -3292,7 +3298,8 @@ server_context_meta server_context::get_meta() const { /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, /* has_inp_audio */ impl->chat_params.allow_audio, - /* json_webui_settings */ impl->json_webui_settings, + /* json_ui_settings */ impl->json_ui_settings, + /* json_webui_settings */ impl->json_webui_settings, // Deprecated /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx_tgt), @@ -3814,8 +3821,12 @@ void server_routes::init_routes() { { "endpoint_slots", params.endpoint_slots }, { "endpoint_props", params.endpoint_props }, { "endpoint_metrics", params.endpoint_metrics }, - { "webui", params.webui }, - { "webui_settings", meta->json_webui_settings }, + // New keys + { "ui", params.ui }, + { "ui_settings", meta->json_ui_settings }, + // Deprecated: use ui/ui_settings instead (kept for backward compat) + { "webui", params.webui }, + { "webui_settings", meta->json_webui_settings }, { "chat_template", tmpl_default }, { "chat_template_caps", meta->chat_template_caps }, { "bos_token", meta->bos_token_str }, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 58dda891441..65853438c93 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -21,7 +21,8 @@ struct server_context_meta { bool has_mtmd; bool has_inp_image; bool has_inp_audio; - json json_webui_settings; + json json_ui_settings; // Primary: new name + json 
json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat) int slot_n_ctx; enum llama_pooling_type pooling_type; diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index af4536fdd77..02c4b53efe8 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -238,10 +238,11 @@ bool server_http_context::init(const common_params & params) { }; auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { - (void)req; // suppress unused parameter warning when LLAMA_BUILD_WEBUI is not defined + (void)req; // suppress unused parameter warning when LLAMA_BUILD_UI / LLAMA_BUILD_WEBUI is not defined bool ready = is_ready.load(); if (!ready) { -#ifdef LLAMA_BUILD_WEBUI +// Support both old and new preprocessor defines +#if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI) auto tmp = string_split(req.path, '.'); if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) { res.status = 503; @@ -305,8 +306,10 @@ bool server_http_context::init(const common_params & params) { // Web UI setup // - if (!params.webui) { - SRV_INF("%s", "the WebUI is disabled\n"); + // Use new `params.ui` field (backed by old `params.webui` for compat) + if (!params.ui) { + SRV_INF("%s", "The UI is disabled\n"); + SRV_INF("%s", "Use --ui/--no-ui (or deprecated --webui/--no-webui) to enable/disable\n"); } else { // register static assets routes if (!params.public_path.empty()) { @@ -317,7 +320,8 @@ bool server_http_context::init(const common_params & params) { return 1; } } else { -#ifdef LLAMA_BUILD_WEBUI +// Support both old and new preprocessor defines +#if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI) // using embedded static index.html srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) { // COEP and COOP headers, required by pyodide (python interpreter) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp 
index 698489a11e2..433d2d8f04e 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1152,14 +1152,17 @@ void server_models_routes::init_routes() { {"role", "router"}, {"max_instances", params.models_max}, {"models_autoload", params.models_autoload}, - // this is a dummy response to make sure webui doesn't break + // this is a dummy response to make sure the UI doesn't break {"model_alias", "llama-server"}, {"model_path", "none"}, {"default_generation_settings", { {"params", json{}}, {"n_ctx", 0}, }}, - {"webui_settings", webui_settings}, + // New key + {"ui_settings", ui_settings}, + // Deprecated: use ui_settings instead (kept for backward compat) + {"webui_settings", webui_settings}, {"build_info", std::string(llama_build_info())}, }); return res; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index f1206c71448..e96d76c9169 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -175,15 +175,22 @@ struct server_models { struct server_models_routes { common_params params; - json webui_settings = json::object(); + json ui_settings = json::object(); // Primary: new name + json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat) server_models models; server_models_routes(const common_params & params, int argc, char ** argv) : params(params), models(params, argc, argv) { - if (!this->params.webui_config_json.empty()) { + // Support both new ui_config_json and deprecated webui_config_json + const std::string & cfg = !this->params.ui_config_json.empty() + ? 
this->params.ui_config_json + : this->params.webui_config_json; + if (!cfg.empty()) { try { - webui_settings = json::parse(this->params.webui_config_json); + json json_settings = json::parse(cfg); + ui_settings = json_settings; + webui_settings = json_settings; // Deprecated: keep in sync } catch (const std::exception & e) { - LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); throw; } } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 823ae5bda36..a232550789c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -208,7 +208,8 @@ int main(int argc, char ** argv) { ctx_http.register_gcp_compat(); // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP) - if (params.webui_mcp_proxy) { + // Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields + if (params.ui_mcp_proxy || params.webui_mcp_proxy) { SRV_WRN("%s", "-----------------\n"); SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n"); SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n"); diff --git a/tools/server/webui/scripts/post-build.sh b/tools/server/webui/scripts/post-build.sh deleted file mode 100755 index 55e46d5d5c6..00000000000 --- a/tools/server/webui/scripts/post-build.sh +++ /dev/null @@ -1,3 +0,0 @@ -rm -rf ../public/_app; -rm ../public/favicon.svg; -rm -f ../public/index.html.gz; # deprecated, but may still be generated by older versions of the build process diff --git a/tools/server/webui/src/lib/constants/localstorage-keys.ts b/tools/server/webui/src/lib/constants/localstorage-keys.ts deleted file mode 100644 index a04194c46f6..00000000000 --- a/tools/server/webui/src/lib/constants/localstorage-keys.ts +++ /dev/null @@ -1,6 +0,0 @@ -export const ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY = 'LlamaCppWebui.alwaysAllowedTools'; -export const CONFIG_LOCALSTORAGE_KEY = 
'LlamaCppWebui.config'; -export const DISABLED_TOOLS_LOCALSTORAGE_KEY = 'LlamaCppWebui.disabledTools'; -export const FAVORITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favoriteModels'; -export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled'; -export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides'; diff --git a/tools/server/webui/.gitignore b/tools/ui/.gitignore similarity index 100% rename from tools/server/webui/.gitignore rename to tools/ui/.gitignore diff --git a/tools/server/webui/.npmrc b/tools/ui/.npmrc similarity index 100% rename from tools/server/webui/.npmrc rename to tools/ui/.npmrc diff --git a/tools/server/webui/.prettierignore b/tools/ui/.prettierignore similarity index 100% rename from tools/server/webui/.prettierignore rename to tools/ui/.prettierignore diff --git a/tools/server/webui/.prettierrc b/tools/ui/.prettierrc similarity index 100% rename from tools/server/webui/.prettierrc rename to tools/ui/.prettierrc diff --git a/tools/server/webui/.storybook/decorators/ModeWatcherDecorator.svelte b/tools/ui/.storybook/decorators/ModeWatcherDecorator.svelte similarity index 100% rename from tools/server/webui/.storybook/decorators/ModeWatcherDecorator.svelte rename to tools/ui/.storybook/decorators/ModeWatcherDecorator.svelte diff --git a/tools/server/webui/.storybook/decorators/TooltipProviderDecorator.svelte b/tools/ui/.storybook/decorators/TooltipProviderDecorator.svelte similarity index 100% rename from tools/server/webui/.storybook/decorators/TooltipProviderDecorator.svelte rename to tools/ui/.storybook/decorators/TooltipProviderDecorator.svelte diff --git a/tools/server/webui/.storybook/main.ts b/tools/ui/.storybook/main.ts similarity index 100% rename from tools/server/webui/.storybook/main.ts rename to tools/ui/.storybook/main.ts diff --git a/tools/server/webui/.storybook/preview.ts b/tools/ui/.storybook/preview.ts similarity index 100% rename from tools/server/webui/.storybook/preview.ts 
rename to tools/ui/.storybook/preview.ts diff --git a/tools/server/webui/.storybook/vitest.setup.ts b/tools/ui/.storybook/vitest.setup.ts similarity index 100% rename from tools/server/webui/.storybook/vitest.setup.ts rename to tools/ui/.storybook/vitest.setup.ts diff --git a/tools/server/webui/README.md b/tools/ui/README.md similarity index 96% rename from tools/server/webui/README.md rename to tools/ui/README.md index 40742b00e1a..4c7671f871b 100644 --- a/tools/server/webui/README.md +++ b/tools/ui/README.md @@ -2,7 +2,7 @@ A modern, feature-rich web interface for llama-server built with SvelteKit. This UI provides an intuitive chat interface with advanced file handling, conversation management, and comprehensive model interaction capabilities. -The WebUI supports two server operation modes: +Llama UI supports two server operation modes: - **MODEL mode** - Single model operation (standard llama-server) - **ROUTER mode** - Multi-model operation with dynamic model loading/unloading @@ -88,7 +88,7 @@ The WebUI supports two server operation modes: ### 1. Install Dependencies ```bash -cd tools/server/webui +cd tools/ui npm install ``` @@ -112,7 +112,7 @@ npm run dev This starts: -- **Vite dev server** at `http://localhost:5173` - The main WebUI +- **Vite dev server** at `http://localhost:5173` - The main UI frontend app - **Storybook** at `http://localhost:6006` - Component documentation The Vite dev server proxies API requests to `http://localhost:8080` (default llama-server port): @@ -186,7 +186,7 @@ npm run build The build process: 1. **Vite Build** - Bundles all TypeScript, Svelte, and CSS -2. **Static Adapter** - Outputs to `../public` (llama-server's static file directory) +2. **Static Adapter** - Outputs to `../../../build/tools/ui` (llama-server's static file directory) 3. **Post-Build Script** - Cleans up intermediate files 4. 
**Custom Plugin** - Creates `index.html` with: - Inlined favicon as base64 @@ -194,7 +194,7 @@ The build process: - Deterministic output (zeroed timestamps) ```text -tools/server/webui/ → build → tools/server/public/ +tools/ui/ → build → build/tools/ui/ ├── src/ ├── index.html (served by llama-server) ├── static/ └── (favicon inlined) └── ... @@ -205,8 +205,8 @@ tools/server/webui/ → build → tools/server/public/ ```javascript // svelte.config.js adapter: adapter({ - pages: '../public', // Output directory - assets: '../public', // Static assets + pages: '../../../build/tools/ui', // Output directory + assets: '../../../build/tools/ui', // Static assets fallback: 'index.html', // SPA fallback strict: true }), @@ -217,20 +217,19 @@ output: { ### Integration with llama-server -The WebUI is embedded directly into the llama-server binary: +llama-ui is embedded directly into the llama-server binary: -1. `npm run build` outputs `index.html` to `tools/server/public/` +1. `npm run build` outputs `index.html` to `build/tools/ui/` 2. llama-server compiles this into the binary at build time -3. When accessing `/`, llama-server serves the gzipped HTML -4. All assets are inlined (CSS, JS, fonts, favicon) +3. When accessing `/`, llama-server serves the bundled HTML -This results in a **single portable binary** with the full WebUI included. +This results in a **single portable binary** with the full Llama UI included. 
--- ## Architecture -The WebUI follows a layered architecture with unidirectional data flow: +Llama UI follows a layered architecture with unidirectional data flow: ```text Routes → Components → Hooks → Stores → Services → Storage/API @@ -659,7 +658,7 @@ npm run check # TypeScript type checking ## Project Structure ```text -tools/server/webui/ +tools/ui/ ├── src/ │ ├── lib/ │ │ ├── components/ # UI components (app/, ui/) diff --git a/tools/server/webui/components.json b/tools/ui/components.json similarity index 100% rename from tools/server/webui/components.json rename to tools/ui/components.json diff --git a/tools/server/webui/docs/architecture/high-level-architecture-simplified.md b/tools/ui/docs/architecture/high-level-architecture-simplified.md similarity index 100% rename from tools/server/webui/docs/architecture/high-level-architecture-simplified.md rename to tools/ui/docs/architecture/high-level-architecture-simplified.md diff --git a/tools/server/webui/docs/architecture/high-level-architecture.md b/tools/ui/docs/architecture/high-level-architecture.md similarity index 100% rename from tools/server/webui/docs/architecture/high-level-architecture.md rename to tools/ui/docs/architecture/high-level-architecture.md diff --git a/tools/server/webui/docs/flows/chat-flow.md b/tools/ui/docs/flows/chat-flow.md similarity index 100% rename from tools/server/webui/docs/flows/chat-flow.md rename to tools/ui/docs/flows/chat-flow.md diff --git a/tools/server/webui/docs/flows/conversations-flow.md b/tools/ui/docs/flows/conversations-flow.md similarity index 100% rename from tools/server/webui/docs/flows/conversations-flow.md rename to tools/ui/docs/flows/conversations-flow.md diff --git a/tools/server/webui/docs/flows/data-flow-simplified-model-mode.md b/tools/ui/docs/flows/data-flow-simplified-model-mode.md similarity index 100% rename from tools/server/webui/docs/flows/data-flow-simplified-model-mode.md rename to tools/ui/docs/flows/data-flow-simplified-model-mode.md diff 
--git a/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md b/tools/ui/docs/flows/data-flow-simplified-router-mode.md similarity index 100% rename from tools/server/webui/docs/flows/data-flow-simplified-router-mode.md rename to tools/ui/docs/flows/data-flow-simplified-router-mode.md diff --git a/tools/server/webui/docs/flows/database-flow.md b/tools/ui/docs/flows/database-flow.md similarity index 100% rename from tools/server/webui/docs/flows/database-flow.md rename to tools/ui/docs/flows/database-flow.md diff --git a/tools/server/webui/docs/flows/mcp-flow.md b/tools/ui/docs/flows/mcp-flow.md similarity index 100% rename from tools/server/webui/docs/flows/mcp-flow.md rename to tools/ui/docs/flows/mcp-flow.md diff --git a/tools/server/webui/docs/flows/models-flow.md b/tools/ui/docs/flows/models-flow.md similarity index 100% rename from tools/server/webui/docs/flows/models-flow.md rename to tools/ui/docs/flows/models-flow.md diff --git a/tools/server/webui/docs/flows/server-flow.md b/tools/ui/docs/flows/server-flow.md similarity index 100% rename from tools/server/webui/docs/flows/server-flow.md rename to tools/ui/docs/flows/server-flow.md diff --git a/tools/server/webui/docs/flows/settings-flow.md b/tools/ui/docs/flows/settings-flow.md similarity index 98% rename from tools/server/webui/docs/flows/settings-flow.md rename to tools/ui/docs/flows/settings-flow.md index 40ad3bd94d7..260713a17b8 100644 --- a/tools/server/webui/docs/flows/settings-flow.md +++ b/tools/ui/docs/flows/settings-flow.md @@ -58,8 +58,8 @@ sequenceDiagram end end - alt serverStore.props has webuiSettings - settingsStore->>settingsStore: Apply webuiSettings from server + alt serverStore.props has uiSettings + settingsStore->>settingsStore: Apply uiSettings from server Note right of settingsStore: Server-provided UI settings
(e.g. showRawOutputSwitch) end diff --git a/tools/server/webui/eslint.config.js b/tools/ui/eslint.config.js similarity index 93% rename from tools/server/webui/eslint.config.js rename to tools/ui/eslint.config.js index cd20fb383a4..185da1dabbe 100644 --- a/tools/server/webui/eslint.config.js +++ b/tools/ui/eslint.config.js @@ -29,7 +29,9 @@ export default ts.config( 'no-undef': 'off', 'svelte/no-at-html-tags': 'off', // This app uses hash-based routing (#/) where resolve() from $app/paths does not apply - 'svelte/no-navigation-without-resolve': 'off' + 'svelte/no-navigation-without-resolve': 'off', + // Enforce empty line at end of file + 'eol-last': 'error' } }, { diff --git a/tools/server/webui/package-lock.json b/tools/ui/package-lock.json similarity index 100% rename from tools/server/webui/package-lock.json rename to tools/ui/package-lock.json diff --git a/tools/server/webui/package.json b/tools/ui/package.json similarity index 98% rename from tools/server/webui/package.json rename to tools/ui/package.json index 2338c38402a..5a1cec66645 100644 --- a/tools/server/webui/package.json +++ b/tools/ui/package.json @@ -5,7 +5,7 @@ "type": "module", "scripts": { "dev": "bash scripts/dev.sh", - "build": "vite build && ./scripts/post-build.sh", + "build": "vite build", "preview": "vite preview", "prepare": "svelte-kit sync || echo ''", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", diff --git a/tools/server/webui/playwright.config.ts b/tools/ui/playwright.config.ts similarity index 72% rename from tools/server/webui/playwright.config.ts rename to tools/ui/playwright.config.ts index 26d3be535d1..d67c3b7e2a6 100644 --- a/tools/server/webui/playwright.config.ts +++ b/tools/ui/playwright.config.ts @@ -2,7 +2,7 @@ import { defineConfig } from '@playwright/test'; export default defineConfig({ webServer: { - command: 'npm run build && http-server ../public -p 8181', + command: 'npm run build && http-server ../../build/tools/ui -p 8181', port: 8181, 
timeout: 120000, reuseExistingServer: false diff --git a/tools/server/webui/scripts/dev.sh b/tools/ui/scripts/dev.sh similarity index 89% rename from tools/server/webui/scripts/dev.sh rename to tools/ui/scripts/dev.sh index 97c14b87328..9256f255aec 100644 --- a/tools/server/webui/scripts/dev.sh +++ b/tools/ui/scripts/dev.sh @@ -2,14 +2,14 @@ # Development script for llama-ui # -# This script starts the webui development servers (Storybook and Vite). +# This script starts the llama-ui development servers (Storybook and Vite). # Note: You need to start llama-server separately. # # Usage: # bash scripts/dev.sh # npm run dev -cd ../../../ +cd ../../ # Check and install git hooks if missing check_and_install_hooks() { @@ -22,13 +22,13 @@ check_and_install_hooks() { if [ "$hooks_missing" = true ]; then echo "🔧 Git hooks missing, installing them..." - cd tools/server/webui + cd tools/ui if bash scripts/install-git-hooks.sh; then echo "✅ Git hooks installed successfully" else echo "⚠️ Failed to install git hooks, continuing anyway..." fi - cd ../../../ + cd ../../ else echo "✅ Git hooks already installed" fi @@ -48,7 +48,7 @@ trap cleanup SIGINT SIGTERM echo "🚀 Starting development servers..." 
echo "📝 Note: Make sure to start llama-server separately if needed" -cd tools/server/webui +cd tools/ui # Use --insecure-http-parser to handle malformed HTTP responses from llama-server # (some responses have both Content-Length and Transfer-Encoding headers) storybook dev -p 6006 --ci & NODE_OPTIONS="--insecure-http-parser" vite dev --host 0.0.0.0 & diff --git a/tools/server/webui/scripts/install-git-hooks.sh b/tools/ui/scripts/install-git-hooks.sh similarity index 62% rename from tools/server/webui/scripts/install-git-hooks.sh rename to tools/ui/scripts/install-git-hooks.sh index 8aa1014ba36..213feb08dcf 100755 --- a/tools/server/webui/scripts/install-git-hooks.sh +++ b/tools/ui/scripts/install-git-hooks.sh @@ -1,29 +1,29 @@ #!/bin/bash -# Script to install pre-commit hook for webui -# Pre-commit: formats, checks, and builds webui +# Script to install pre-commit hook for llama-ui +# Pre-commit: formats, checks, and builds the UI app REPO_ROOT=$(git rev-parse --show-toplevel) PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit" -echo "Installing pre-commit hook for webui..." +echo "Installing pre-commit hook for llama-ui..." # Create the pre-commit hook cat > "$PRE_COMMIT_HOOK" << 'EOF' #!/bin/bash -# Check if there are any changes in the webui directory -if git diff --cached --name-only | grep -q "^tools/server/webui/"; then +# Check if there are any changes in the tools/ui directory +if git diff --cached --name-only | grep -q "^tools/ui/"; then REPO_ROOT=$(git rev-parse --show-toplevel) - cd "$REPO_ROOT/tools/server/webui" + cd "$REPO_ROOT/tools/ui" # Check if package.json exists if [ ! -f "package.json" ]; then - echo "Error: package.json not found in tools/server/webui" + echo "Error: package.json not found in tools/ui" exit 1 fi - echo "Formatting and checking webui code..." + echo "Formatting and checking llama-ui code..." 
# Run the format command npm run format @@ -46,17 +46,17 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then exit 1 fi - echo "✅ Webui code formatted and checked successfully" + echo "✅ llama-ui code formatted and checked successfully" - # Build the webui - echo "Building webui..." + # Build the llama-ui + echo "Building llama-ui..." npm run build if [ $? -ne 0 ]; then echo "❌ npm run build failed" exit 1 fi - echo "✅ Webui built successfully" + echo "✅ llama-ui built successfully" fi exit 0 @@ -70,8 +70,8 @@ if [ $? -eq 0 ]; then echo " Pre-commit: $PRE_COMMIT_HOOK" echo "" echo "The hook will automatically:" - echo " • Format, lint and check webui code before commits" - echo " • Build webui" + echo " • Format, lint and check llama-ui code before commits" + echo " • Build llama-ui" else echo "❌ Failed to make hook executable" exit 1 diff --git a/tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts similarity index 64% rename from tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts rename to tools/ui/scripts/vite-plugin-llama-cpp-build.ts index 0330a1dda12..5deb0f81054 100644 --- a/tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts +++ b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts @@ -1,4 +1,12 @@ -import { readFileSync, writeFileSync, existsSync, readdirSync, copyFileSync } from 'fs'; +import { + readFileSync, + writeFileSync, + existsSync, + readdirSync, + copyFileSync, + rmSync, + unlinkSync +} from 'fs'; import { resolve } from 'path'; import type { Plugin } from 'vite'; @@ -11,28 +19,28 @@ const GUIDE_FOR_FRONTEND = ` --> `.trim(); +const OUTPUT_DIR = '../../build/tools/ui'; + export function llamaCppBuildPlugin(): Plugin { return { name: 'llamacpp:build', apply: 'build', closeBundle() { - // Ensure the SvelteKit adapter has finished writing to ../public setTimeout(() => { try { - const indexPath = resolve('../public/index.html'); + const outDir = 
resolve(OUTPUT_DIR); + const indexPath = resolve(outDir, 'index.html'); if (!existsSync(indexPath)) return; let content = readFileSync(indexPath, 'utf-8'); + // Inline favicon as base64 data URL const faviconPath = resolve('static/favicon.svg'); - if (existsSync(faviconPath)) { const faviconContent = readFileSync(faviconPath, 'utf-8'); const faviconBase64 = Buffer.from(faviconContent).toString('base64'); const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`; - content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`); - console.log('✓ Inlined favicon.svg as base64 data URL'); } @@ -48,17 +56,16 @@ export function llamaCppBuildPlugin(): Plugin { writeFileSync(indexPath, content, 'utf-8'); console.log('✓ Updated index.html'); - // Copy bundle.*.js -> ../public/bundle.js - const immutableDir = resolve('../public/_app/immutable'); - const bundleDir = resolve('../public/_app/immutable/assets'); + // Copy bundle.*.js -> bundle.js at output root + const immutableDir = resolve(outDir, '_app/immutable'); + const bundleDir = resolve(outDir, '_app/immutable/assets'); if (existsSync(immutableDir)) { const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/)); - if (jsFiles.length > 0) { - copyFileSync(resolve(immutableDir, jsFiles[0]), resolve('../public/bundle.js')); + copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js')); // Normalize __sveltekit_ to __sveltekit__ in bundle.js - const bundleJsPath = resolve('../public/bundle.js'); + const bundleJsPath = resolve(outDir, 'bundle.js'); let bundleJs = readFileSync(bundleJsPath, 'utf-8'); bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__'); writeFileSync(bundleJsPath, bundleJs, 'utf-8'); @@ -66,17 +73,29 @@ export function llamaCppBuildPlugin(): Plugin { } } - // Copy bundle.*.css -> ../public/bundle.css + // Copy bundle.*.css -> bundle.css at output root if (existsSync(bundleDir)) { const cssFiles = 
readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/)); - if (cssFiles.length > 0) { - copyFileSync(resolve(bundleDir, cssFiles[0]), resolve('../public/bundle.css')); + copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css')); console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`); } } + + // Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz + const appDir = resolve(outDir, '_app'); + if (existsSync(appDir)) { + rmSync(appDir, { recursive: true, force: true }); + console.log('✓ Removed _app directory'); + } + + const faviconOut = resolve(outDir, 'favicon.svg'); + if (existsSync(faviconOut)) { + unlinkSync(faviconOut); + console.log('✓ Removed favicon.svg'); + } } catch (error) { - console.error('Failed to update index.html:', error); + console.error('Failed to process build output:', error); } }, 100); } diff --git a/tools/server/webui/src/app.css b/tools/ui/src/app.css similarity index 100% rename from tools/server/webui/src/app.css rename to tools/ui/src/app.css diff --git a/tools/server/webui/src/app.d.ts b/tools/ui/src/app.d.ts similarity index 100% rename from tools/server/webui/src/app.d.ts rename to tools/ui/src/app.d.ts diff --git a/tools/server/webui/src/app.html b/tools/ui/src/app.html similarity index 100% rename from tools/server/webui/src/app.html rename to tools/ui/src/app.html diff --git a/tools/server/webui/src/lib/actions/fade-in-view.svelte.ts b/tools/ui/src/lib/actions/fade-in-view.svelte.ts similarity index 100% rename from tools/server/webui/src/lib/actions/fade-in-view.svelte.ts rename to tools/ui/src/lib/actions/fade-in-view.svelte.ts diff --git a/tools/server/webui/src/lib/components/app/SKILL.md b/tools/ui/src/lib/components/app/SKILL.md similarity index 100% rename from tools/server/webui/src/lib/components/app/SKILL.md rename to tools/ui/src/lib/components/app/SKILL.md diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte 
b/tools/ui/src/lib/components/app/actions/ActionIcon.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte rename to tools/ui/src/lib/components/app/actions/ActionIcon.svelte diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte b/tools/ui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte rename to tools/ui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte diff --git a/tools/server/webui/src/lib/components/app/actions/index.ts b/tools/ui/src/lib/components/app/actions/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/index.ts rename to tools/ui/src/lib/components/app/actions/index.ts diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte b/tools/ui/src/lib/components/app/badges/BadgeInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte rename to tools/ui/src/lib/components/app/badges/BadgeInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte b/tools/ui/src/lib/components/app/badges/BadgesModality.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte rename to tools/ui/src/lib/components/app/badges/BadgesModality.svelte diff --git a/tools/server/webui/src/lib/components/app/badges/index.ts b/tools/ui/src/lib/components/app/badges/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/index.ts rename to tools/ui/src/lib/components/app/badges/index.ts diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte 
similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte rename to 
tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte diff --git 
a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte rename to 
tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte similarity index 100% rename from 
tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte diff --git 
a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte similarity index 97% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte index 175eb3c8c7f..e053e6f836b 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte @@ -54,7 +54,12 @@ } const attachmentMenu = useAttachmentMenu( - () => ({ hasVisionModality, hasAudioModality, hasMcpPromptsSupport, hasMcpResourcesSupport }), + () => ({ + hasVisionModality, + hasAudioModality, + hasMcpPromptsSupport, + hasMcpResourcesSupport + }), () => ({ onFileUpload, onSystemPromptClick, onMcpPromptClick, 
onMcpResourcesClick }), () => { dropdownOpen = false; diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte similarity index 98% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte index 99daa10e3e2..4713ec47758 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte @@ -46,7 +46,12 @@ let sheetOpen = $state(false); const attachmentMenu = useAttachmentMenu( - () => ({ hasVisionModality, hasAudioModality, hasMcpPromptsSupport, hasMcpResourcesSupport }), + () => ({ + hasVisionModality, + hasAudioModality, + hasMcpPromptsSupport, + hasMcpResourcesSupport + }), () => ({ onFileUpload, onSystemPromptClick, onMcpPromptClick, onMcpResourcesClick }), () => { sheetOpen = false; diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte 
b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte similarity index 97% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte index ccc35d98f87..b11467da823 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte @@ -5,6 +5,7 @@ import * as DropdownMenu from '$lib/components/ui/dropdown-menu'; import * as Tooltip from '$lib/components/ui/tooltip'; import { toolsStore } from '$lib/stores/tools.svelte'; + import { CLI_FLAGS } from '$lib/constants'; import { mcpStore } from '$lib/stores/mcp.svelte'; import { useToolsPanel } from '$lib/hooks/use-tools-panel.svelte'; @@ -33,7 +34,7 @@ - Run llama-server with --tools flag to enable + Run llama-server with {CLI_FLAGS.TOOLS} flag to enable Built-in Tools. 
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte 
b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte 
rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte similarity index 100% rename from 
tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte diff --git 
a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte similarity index 100% rename from 
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte 
b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte rename to 
tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte diff --git 
a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte similarity index 99% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte index e9b77ba2f4f..3a9cc7e9356 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte @@ -274,7 +274,9 @@ {:else if section.toolResult}
{#each section.parsedLines as line, i (i)} -
{line.text}
+
+ {line.text} +
{#if line.image} (Run
llama-server
with -
--webui-mcp-proxy
+
{CLI_FLAGS.MCP_PROXY}
flag) {/if} diff --git a/tools/server/webui/src/lib/components/app/mcp/McpServerIdentity.svelte b/tools/ui/src/lib/components/app/mcp/McpServerIdentity.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/mcp/McpServerIdentity.svelte rename to tools/ui/src/lib/components/app/mcp/McpServerIdentity.svelte diff --git a/tools/server/webui/src/lib/components/app/mcp/McpServerInfo.svelte b/tools/ui/src/lib/components/app/mcp/McpServerInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/mcp/McpServerInfo.svelte rename to tools/ui/src/lib/components/app/mcp/McpServerInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/mcp/index.ts b/tools/ui/src/lib/components/app/mcp/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/mcp/index.ts rename to tools/ui/src/lib/components/app/mcp/index.ts diff --git a/tools/server/webui/src/lib/components/app/misc/CodeBlockActions.svelte b/tools/ui/src/lib/components/app/misc/CodeBlockActions.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/CodeBlockActions.svelte rename to tools/ui/src/lib/components/app/misc/CodeBlockActions.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte b/tools/ui/src/lib/components/app/misc/ConversationSelection.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte rename to tools/ui/src/lib/components/app/misc/ConversationSelection.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte b/tools/ui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte rename to tools/ui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte diff --git 
a/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte b/tools/ui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte rename to tools/ui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte b/tools/ui/src/lib/components/app/misc/TruncatedText.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte rename to tools/ui/src/lib/components/app/misc/TruncatedText.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/index.ts b/tools/ui/src/lib/components/app/misc/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/index.ts rename to tools/ui/src/lib/components/app/misc/index.ts diff --git a/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte b/tools/ui/src/lib/components/app/models/ModelBadge.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelBadge.svelte rename to tools/ui/src/lib/components/app/models/ModelBadge.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelId.svelte b/tools/ui/src/lib/components/app/models/ModelId.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelId.svelte rename to tools/ui/src/lib/components/app/models/ModelId.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorDropdown.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorDropdown.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorList.svelte 
b/tools/ui/src/lib/components/app/models/ModelsSelectorList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorList.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorList.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorOption.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorOption.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorOption.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorOption.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorSheet.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorSheet.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte diff --git a/tools/server/webui/src/lib/components/app/models/index.ts b/tools/ui/src/lib/components/app/models/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/models/index.ts rename to tools/ui/src/lib/components/app/models/index.ts diff --git a/tools/server/webui/src/lib/components/app/models/utils.ts b/tools/ui/src/lib/components/app/models/utils.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/models/utils.ts rename to tools/ui/src/lib/components/app/models/utils.ts diff --git a/tools/server/webui/src/lib/components/app/navigation/DesktopIconStrip.svelte b/tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/navigation/DesktopIconStrip.svelte rename to tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte diff --git a/tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte b/tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte 
similarity index 100% rename from tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte rename to tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte diff --git a/tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte b/tools/ui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte rename to tools/ui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte diff --git a/tools/server/webui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte similarity index 99% rename from tools/server/webui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte rename to tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte index 105576bb4f5..ddaf4d5b875 100644 --- a/tools/server/webui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte +++ b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte @@ -174,7 +174,9 @@
-

{APP_NAME}

+

+ {APP_NAME} +