diff --git a/.abi-check/7.1.0/postgres.symbols.ignore b/.abi-check/7.1.0/postgres.symbols.ignore index 848dbf2841d..d42d77c4039 100644 --- a/.abi-check/7.1.0/postgres.symbols.ignore +++ b/.abi-check/7.1.0/postgres.symbols.ignore @@ -1 +1,12 @@ pgarch_start +ConfigureNamesInt_gp +child_triggers +has_update_triggers +ConfigureNamesBool_gp +aocs_beginscan +AppendOnlyBlockDirectory_GetEntry +ConfigureNamesString_gp +gp_pause_on_restore_point_replay +ConfigureNamesReal_gp +TableAmRoutine +MainLWLockNames diff --git a/.asf.yaml b/.asf.yaml index 06a1287b3ca..01188659355 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -85,6 +85,7 @@ github: # Actions workflows. They do not include the workflow name as a # prefix contexts: + - rat-check - check-skip - Build Apache Cloudberry RPM - RPM Install Test Apache Cloudberry @@ -121,6 +122,18 @@ github: # Require conversation threads to be resolved required_conversation_resolution: true + # Branch protection for REL_2_STABLE release branch + REL_2_STABLE: + # Pull request review requirements + required_pull_request_reviews: + # Require new reviews when new commits are pushed + dismiss_stale_reviews: false + # Require at least 2 approving reviews + required_approving_review_count: 2 + + # Require conversation threads to be resolved + required_conversation_resolution: true + # Branch cleanup settings # Don't automatically delete branches after merging del_branch_on_merge: true diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100644 index 00000000000..ae1651742e0 --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,258 @@ + + +# GitHub Actions Workflows + +This directory contains GitHub Actions workflows for Apache Cloudberry CI/CD. + +## Table of Contents + +- [Available Workflows](#available-workflows) +- [Manual Workflow Triggers](#manual-workflow-triggers) +- [Artifact Reuse for Faster Testing](#artifact-reuse-for-faster-testing) +- [Running Workflows in Forked Repositories](#running-workflows-in-forked-repositories) + +## Available Workflows + +| Workflow | Purpose | Trigger | +|----------|---------|---------| +| `build-cloudberry.yml` | Main CI: build, test, create RPMs | Push, PR, Manual | +| `build-dbg-cloudberry.yml` | Debug build with assertions enabled | Push, PR, Manual | +| `apache-rat-audit.yml` | License header compliance check | Push, PR | +| `coverity.yml` | Static code analysis with Coverity | Weekly, Manual | +| `sonarqube.yml` | Code quality analysis with SonarQube | Push to main | +| `docker-cbdb-build-containers.yml` | Build Docker images for CI | Manual | +| `docker-cbdb-test-containers.yml` | Build test Docker images | Manual | + +## Manual Workflow Triggers + +Many workflows support manual triggering via `workflow_dispatch`, allowing developers to run CI jobs on-demand. + +### How to Manually Trigger a Workflow + +1. Navigate to the **Actions** tab in GitHub +2. Select the workflow from the left sidebar (e.g., "Build and Test Cloudberry") +3. Click **Run workflow** button (top right) +4. Select your branch +5. Configure input parameters (if available) +6. 
Click **Run workflow** + +### Workflow Input Parameters + +#### `build-cloudberry.yml` - Main CI + +| Parameter | Description | Default | Example | +|-----------|-------------|---------|---------| +| `test_selection` | Comma-separated list of tests to run, or "all" | `all` | `ic-good-opt-off,ic-contrib` | +| `reuse_artifacts_from_run_id` | Run ID to reuse build artifacts from (see below) | _(empty)_ | `12345678901` | + +**Available test selections:** +- `all` - Run all test suites +- `ic-good-opt-off` - Installcheck with optimizer off +- `ic-good-opt-on` - Installcheck with optimizer on +- `ic-contrib` - Contrib extension tests +- `ic-resgroup` - Resource group tests +- `ic-resgroup-v2` - Resource group v2 tests +- `ic-resgroup-v2-memory-accounting` - Resource group memory tests +- `ic-singlenode` - Single-node mode tests +- `make-installcheck-world` - Full test suite +- And more... (see workflow for complete list) + +## Artifact Reuse for Faster Testing + +When debugging test failures, rebuilding Cloudberry (~50-70 minutes) on every iteration is inefficient. The artifact reuse feature allows you to reuse build artifacts from a previous successful run. + +### How It Works + +1. Build artifacts (RPMs, source tarballs) from a previous workflow run are downloaded +2. Build job is skipped (saves ~45-60 minutes) +3. RPM installation test is skipped (saves ~5-10 minutes) +4. Test jobs run with the reused artifacts +5. You can iterate on test configurations without rebuilding + +### Step-by-Step Guide + +#### 1. Find the Run ID + +After a successful build (even if tests failed), get the run ID: + +**Option A: From GitHub Actions UI** +- Go to **Actions** tab → Click on a completed workflow run +- The URL will be: `https://github.com/apache/cloudberry/actions/runs/12345678901` +- The run ID is `12345678901` + +**Option B: From GitHub API** +```bash +# List recent workflow runs +gh run list --workflow=build-cloudberry.yml --limit 5 + +# Get run ID from specific branch +gh run list --workflow=build-cloudberry.yml --branch=my-feature --limit 1 +``` + +#### 2. Trigger New Run with Artifact Reuse + +**Via GitHub UI:** +1. Go to **Actions** → **Build and Test Cloudberry** +2. Click **Run workflow** +3. Enter the run ID in **"Reuse build artifacts from a previous run ID"** +4. Optionally customize **test_selection** +5. Click **Run workflow** + +**Via GitHub CLI:** +```bash +# Reuse artifacts from run 12345678901, run only specific tests +gh workflow run build-cloudberry.yml \ + --field reuse_artifacts_from_run_id=12345678901 \ + --field test_selection=ic-good-opt-off +``` + +#### 3. 
Monitor Test Execution + +- Build job will be skipped (shows as "Skipped" in Actions UI) +- RPM Install Test will be skipped +- Test jobs will run with artifacts from the specified run ID +- Total time: ~15-30 minutes (vs ~65-100 minutes for full build+test) + +### Use Cases + +**Debugging a specific test failure:** +```bash +# Run 1: Full build + all tests (finds test failure in ic-good-opt-off) +gh workflow run build-cloudberry.yml + +# Get the run ID from output +RUN_ID=$(gh run list --workflow=build-cloudberry.yml --limit 1 --json databaseId --jq '.[0].databaseId') + +# Run 2: Reuse artifacts, run only failing test +gh workflow run build-cloudberry.yml \ + --field reuse_artifacts_from_run_id=$RUN_ID \ + --field test_selection=ic-good-opt-off +``` + +**Testing different configurations:** +```bash +# Test with optimizer off, then on, using same build +gh workflow run build-cloudberry.yml \ + --field reuse_artifacts_from_run_id=$RUN_ID \ + --field test_selection=ic-good-opt-off + +gh workflow run build-cloudberry.yml \ + --field reuse_artifacts_from_run_id=$RUN_ID \ + --field test_selection=ic-good-opt-on +``` + +### Limitations + +- Artifacts expire after 90 days (GitHub default retention) +- Run ID must be from the same repository (or accessible fork) +- Artifacts must include both RPM and source build artifacts +- Cannot reuse artifacts across different OS/architecture combinations +- Changes to source code require a fresh build + +## Running Workflows in Forked Repositories + +GitHub Actions workflows are enabled in forks, allowing you to validate changes before submitting a Pull Request. + +### Initial Setup (One-Time) + +1. **Fork the repository** to your GitHub account + +2. **Enable GitHub Actions** in your fork: + - Go to your fork's **Actions** tab + - Click **"I understand my workflows, go ahead and enable them"** + +**Secrets Configuration:** + +No manual secret configuration is required for the main build and test workflows. + +- `GITHUB_TOKEN` is automatically provided by GitHub and used when downloading artifacts from previous runs (artifact reuse feature) +- DockerHub secrets (`DOCKERHUB_USER`, `DOCKERHUB_TOKEN`) are only required for building custom container images (advanced/maintainer use case, not needed for typical development) + +### Workflow Behavior in Forks + +- ✅ **Automated triggers work**: Push and PR events trigger workflows +- ✅ **Manual triggers work**: `workflow_dispatch` is fully functional +- ✅ **Artifact reuse works**: Can reuse artifacts from previous runs in your fork +- ⚠️ **Cross-fork artifact reuse**: Not supported (security restriction) +- ⚠️ **Some features may be limited**: Certain features requiring organization-level secrets may not work + +### Best Practices for Fork Development + +1. **Test locally first** when possible (faster iteration) +2. **Use manual triggers** to avoid burning GitHub Actions minutes unnecessarily +3. **Use artifact reuse** to iterate on test failures efficiently +4. **Push to feature branches** to trigger automated CI +5. **Review Actions tab** to ensure workflows completed successfully before opening PR + +### Example Fork Workflow + +```bash +# 1. Create feature branch in fork +git checkout -b fix-test-failure + +# 2. Make changes and push to fork +git commit -am "Fix test failure" +git push origin fix-test-failure + +# 3. CI runs automatically on push + +# 4. 
If tests fail, iterate using artifact reuse +# Get run ID from your fork's Actions tab +gh workflow run build-cloudberry.yml \ + --field reuse_artifacts_from_run_id=12345678901 \ + --field test_selection=ic-good-opt-off + +# 5. Once tests pass, open PR to upstream +gh pr create --web +``` + +## Troubleshooting + +### "Build job was skipped but tests failed to start" + +**Cause:** Artifacts from specified run ID not found or expired + +**Solution:** +- Verify the run ID is correct +- Check that run completed successfully (built artifacts) +- Run a fresh build if artifacts expired (>90 days) + +### "Workflow not found in fork" + +**Cause:** GitHub Actions not enabled in fork + +**Solution:** +- Go to fork's **Actions** tab +- Click to enable workflows + +### "Resource not accessible by integration" + +**Cause:** Workflow trying to access artifacts from different repository + +**Solution:** +- Can only reuse artifacts from same repository +- Run a fresh build in your fork first, then reuse those artifacts + +## Additional Resources + +- [GitHub Actions Documentation](https://docs.github.com/en/actions) +- [Cloudberry Contributing Guide](../../CONTRIBUTING.md) +- [Cloudberry Build Guide](../../deploy/build/README.md) +- [DevOps Scripts](../../devops/README.md) diff --git a/.github/workflows/apache-rat-audit.yml b/.github/workflows/apache-rat-audit.yml new file mode 100644 index 00000000000..4826fc89228 --- /dev/null +++ b/.github/workflows/apache-rat-audit.yml @@ -0,0 +1,347 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Apache Cloudberry (Incubating) Compliance Workflow +# +# Comprehensive compliance checks for Apache Cloudberry: +# 1. Apache RAT license header validation +# 2. Copyright year verification (NOTICE and psql help.c) +# 3. 
Binary file presence detection with approved allowlist +# +# Based on Apache Rat tool, run locally with: +# `mvn clean verify -Drat.consoleOutput=true` +# -------------------------------------------------------------------- + +name: Apache Rat License Check + +on: + push: + branches: [main, REL_2_STABLE] + pull_request: + branches: [main, REL_2_STABLE] + types: [opened, synchronize, reopened, edited] + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + rat-check: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up Java and Maven + uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + cache: maven + + - name: Run Apache Rat check + run: | + echo "Running Apache Rat license check..." + mvn clean verify -Drat.consoleOutput=true | tee rat-output.log + + # Check for build failure + if grep -q "\[INFO\] BUILD FAILURE" rat-output.log; then + echo "::error::Apache Rat check failed - build failure detected" + echo "RAT_CHECK=fail" >> $GITHUB_ENV + else + echo "RAT_CHECK=pass" >> $GITHUB_ENV + echo "Apache Rat check passed successfully" + fi + + - name: Check copyright years are up-to-date + run: | + echo "Checking copyright years..." + current_year=$(date -u +"%Y") + echo "CURRENT_YEAR=$current_year" >> $GITHUB_ENV + + # Initialize to pass, will be updated if checks fail + echo "NOTICE_CHECK=pass" >> $GITHUB_ENV + echo "PSQL_HELP_CHECK=pass" >> $GITHUB_ENV + + # Check NOTICE file + echo "Checking NOTICE file..." + if ! grep -q "Copyright 2024-$current_year The Apache Software Foundation" NOTICE; then + echo "::error::NOTICE file does not contain the current year ($current_year)" + echo "NOTICE_CHECK=fail" >> $GITHUB_ENV + else + echo "PASS: NOTICE file contains the current year ($current_year)" + fi + + # Check psql help.c file + echo "Checking src/bin/psql/help.c..." + if ! grep -q "Copyright 2024-$current_year The Apache Software Foundation" src/bin/psql/help.c; then + echo "::error::src/bin/psql/help.c does not contain the current year ($current_year)" + echo "PSQL_HELP_CHECK=fail" >> $GITHUB_ENV + else + echo "PASS: src/bin/psql/help.c contains the current year ($current_year)" + fi + + # Continue execution even if checks fail + if [ "$NOTICE_CHECK" = "pass" ] && [ "$PSQL_HELP_CHECK" = "pass" ]; then + echo "All copyright year checks passed" + else + echo "Copyright year checks completed with errors" + fi + + - name: Check for binary files + run: | + echo "Checking for binary files..." 
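+          # Files listed in ALLOWLIST below are approved binaries (see README.apache.md);
+          # they are logged for visibility only. Any other file with a checked extension
+          # marks the run with BINARY_CHECK=fail.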
+ echo "Checking extensions: class, jar, tar, tgz, zip, exe, dll, so, gz, bz2" + echo "----------------------------------------------------------------------" + + # Binary file allowlist, see README.apache.md + ALLOWLIST=( + "contrib/formatter_fixedwidth/data/fixedwidth_small_correct.tbl.gz" + "gpMgmt/demo/gppkg/sample-sources.tar.gz" + "src/bin/gpfdist/regress/data/exttab1/nation.tbl.gz" + "src/bin/gpfdist/regress/data/gpfdist2/gz_multi_chunk.tbl.gz" + "src/bin/gpfdist/regress/data/gpfdist2/gz_multi_chunk_2.tbl.gz" + "src/bin/gpfdist/regress/data/gpfdist2/lineitem.tbl.bz2" + "src/bin/gpfdist/regress/data/gpfdist2/lineitem.tbl.gz" + ) + + # Check for specific binary file extensions + binary_extensions="class jar tar tgz zip exe dll so gz bz2" + echo "BINARY_EXTENSIONS=${binary_extensions}" >> $GITHUB_ENV + binary_results="" + binaryfiles_found=false + + for extension in ${binary_extensions}; do + printf "Checking *.%-4s files..." "${extension}" + found=$(find . -name "*.${extension}" -type f || true) + + # Filter out allowed files + if [ -n "$found" ]; then + filtered_found="" + while IFS= read -r file; do + is_allowed=false + for allowlist_file in "${ALLOWLIST[@]}"; do + if [ "$file" = "./$allowlist_file" ]; then + is_allowed=true + echo "Allowed: $file" >> binary_allowlist.txt + break + fi + done + if [ "$is_allowed" = false ]; then + filtered_found+="$file"$'\n' + fi + done <<< "$found" + + filtered_found=$(echo "$filtered_found" | sed '/^$/d') + + if [ -n "$filtered_found" ]; then + echo "FOUND" + echo "::error::${extension} files should not exist" + echo "For ASF compatibility: the source tree should not contain" + echo "binary files as users have a hard time verifying their contents." + echo "Found files:" + echo "$filtered_found" | sed 's/^/ /' + echo "${extension}:${filtered_found}" >> binary_results.txt + binaryfiles_found=true + else + echo "NONE (all allowed)" + echo "${extension}:none" >> binary_results.txt + fi + else + echo "NONE" + echo "${extension}:none" >> binary_results.txt + fi + done + + echo "----------------------------------------------------------------------" + if [ "$binaryfiles_found" = true ]; then + echo "ERROR: Non-allowed binary files were found in the source tree" + echo "BINARY_CHECK=fail" >> $GITHUB_ENV + else + echo "PASS: No non-allowed binary files found" + echo "BINARY_CHECK=pass" >> $GITHUB_ENV + fi + + # Show allowlist summary if any allowed files were found + if [ -f binary_allowlist.txt ]; then + echo "" + echo "Allowed binary files (approved):" + cat binary_allowlist.txt | sed 's/^/ /' + fi + + - name: Upload Rat check results + if: always() + uses: actions/upload-artifact@v4 + with: + name: rat-check-results + path: rat-output.log + retention-days: 7 + + - name: Generate Job Summary + if: always() + run: | + { + echo "## Apache Cloudberry Compliance Audit Results" + echo "- Run Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "" + + # Copyright Year Check Summary + echo "### Copyright Year Checks" + echo "**NOTICE file:**" + if [ "$NOTICE_CHECK" = "pass" ]; then + echo "PASS: Contains current year ($CURRENT_YEAR)" + else + echo "ERROR: Does not contain current year ($CURRENT_YEAR)" + fi + echo "" + echo "**psql help.c:**" + if [ "$PSQL_HELP_CHECK" = "pass" ]; then + echo "PASS: Contains current year ($CURRENT_YEAR)" + else + echo "ERROR: Does not contain current year ($CURRENT_YEAR)" + fi + echo "" + + # Binary Files Check Summary + echo "### Binary Files Check" + echo "Checked extensions: \`${BINARY_EXTENSIONS}\`" + echo "" + echo "Results:" + 
echo "\`\`\`" + if [ -f binary_results.txt ]; then + while IFS=: read -r ext files; do + if [ "$files" = "none" ]; then + echo "PASS: No .${ext} files found" + else + echo "ERROR: Found .${ext} files:" + echo "$files" | sed 's/^/ /' + fi + done < binary_results.txt + fi + echo "\`\`\`" + echo "" + + # Allowlist summary + if [ -f binary_allowlist.txt ]; then + echo "### Allowed Binary Files" + echo "The following binary files are approved for testing purposes:" + echo "You can see [README.apache.md](https://github.com/apache/cloudberry/blob/main/README.apache.md) for details." + echo "\`\`\`" + cat binary_allowlist.txt | sed 's/Allowed: //' + echo "\`\`\`" + echo "" + fi + + # Rat check summary + if [[ -f rat-output.log ]]; then + # First extract and display summary statistics (only once) + if grep -q "Rat check: Summary over all files" rat-output.log; then + echo "### License Header Check" + summary_line=$(grep "Rat check: Summary over all files" rat-output.log) + echo "\`\`\`" + echo "$summary_line" + echo "\`\`\`" + echo "" + fi + + # Then determine the result status + if [ "$RAT_CHECK" = "fail" ]; then + echo "#### Check Failed - License Compliance Issues Detected" + echo "" + + # Extract and display files with unapproved licenses + if grep -q "Files with unapproved licenses:" rat-output.log; then + echo "##### Files with Unapproved Licenses" + echo "\`\`\`" + # Get the line with "Files with unapproved licenses:" and all following lines until the dashed line + sed -n '/Files with unapproved licenses:/,/\[INFO\] ------------------------------------------------------------------------/p' rat-output.log | \ + grep -v "\[INFO\] ------------------------------------------------------------------------" | \ + grep -v "^$" | \ + head -20 + echo "\`\`\`" + echo "" + fi + + echo "**How to fix:**" + echo "" + echo "**For new original files you created:**" + echo "- Add the standard Apache License header to each file" + echo "" + echo "**For third-party files with different licenses:**" + echo "- Add the file to exclusion list in \`pom.xml\` under the rat-maven-plugin configuration" + echo "- Ensure the license is compatible with Apache License 2.0" + echo "- Avoid introducing components with incompatible licenses" + echo "" + echo "**Need help?**" + echo "- Run \`mvn clean verify -Drat.consoleOutput=true\` locally for the full report" + echo "- Email dev@cloudberry.apache.org if you have questions about license compatibility" + + elif [ "$RAT_CHECK" = "pass" ]; then + echo "#### Check Passed - All Files Comply with Apache License Requirements" + fi + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Report Status + if: always() + shell: bash {0} + run: | + # Check overall status of all checks + overall_status=0 + + # Check Apache RAT status + if [ "$RAT_CHECK" = "fail" ]; then + echo "ERROR: Apache Rat check failed" + overall_status=1 + elif [ "$RAT_CHECK" = "pass" ]; then + echo "Apache Rat check passed" + fi + + # Check copyright year status + if [ -n "$NOTICE_CHECK" ] && [ "$NOTICE_CHECK" = "fail" ]; then + echo "ERROR: NOTICE file copyright year check failed" + overall_status=1 + fi + if [ -n "$PSQL_HELP_CHECK" ] && [ "$PSQL_HELP_CHECK" = "fail" ]; then + echo "ERROR: psql help.c copyright year check failed" + overall_status=1 + fi + + # Check binary files status (if this variable exists) + if [ -n "$BINARY_CHECK" ] && [ "$BINARY_CHECK" = "fail" ]; then + echo "ERROR: Binary files check failed" + overall_status=1 + fi + + # Exit with appropriate status + if [ $overall_status -eq 0 ]; then + echo 
"SUCCESS: All checks passed" + exit 0 + else + echo "FAILURE: One or more checks failed" + exit 1 + fi \ No newline at end of file diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml index a6e659596b5..9d44d06bbdc 100644 --- a/.github/workflows/build-cloudberry.yml +++ b/.github/workflows/build-cloudberry.yml @@ -102,9 +102,9 @@ name: Apache Cloudberry Build on: push: - branches: [main] + branches: [main, REL_2_STABLE] pull_request: - branches: [main] + branches: [main, REL_2_STABLE] types: [opened, synchronize, reopened, edited] workflow_dispatch: inputs: @@ -113,6 +113,11 @@ on: required: false default: 'all' type: string + reuse_artifacts_from_run_id: + description: 'Reuse build artifacts from a previous run ID (leave empty to build fresh)' + required: false + default: '' + type: string concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -281,18 +286,25 @@ jobs: }, {"test":"ic-contrib", "make_configs":["contrib/auto_explain:installcheck", + "contrib/amcheck:installcheck", "contrib/citext:installcheck", "contrib/btree_gin:installcheck", + "contrib/btree_gist:installcheck", + "contrib/dblink:installcheck", + "contrib/dict_int:installcheck", + "contrib/dict_xsyn:installcheck", + "contrib/extprotocol:installcheck", "contrib/file_fdw:installcheck", "contrib/formatter_fixedwidth:installcheck", - "contrib/extprotocol:installcheck", - "contrib/dblink:installcheck", + "contrib/hstore:installcheck", + "contrib/indexscan:installcheck", "contrib/pg_trgm:installcheck", "contrib/indexscan:installcheck", - "contrib/hstore:installcheck", "contrib/pgcrypto:installcheck", + "contrib/pgstattuple:installcheck", "contrib/tablefunc:installcheck", "contrib/passwordcheck:installcheck", + "contrib/pg_buffercache:installcheck", "contrib/sslinfo:installcheck"] }, {"test":"ic-gpcontrib", @@ -309,12 +321,18 @@ jobs: {"test":"ic-isolation2", "make_configs":["src/test/isolation2:installcheck-isolation2"] }, + {"test":"ic-isolation2-hot-standby", + "make_configs":["src/test/isolation2:installcheck-hot-standby"] + }, {"test":"ic-isolation2-crash", "make_configs":["src/test/isolation2:installcheck-isolation2-crash"], "enable_core_check":false }, {"test":"ic-parallel-retrieve-cursor", "make_configs":["src/test/isolation2:installcheck-parallel-retrieve-cursor"] + }, + {"test":"ic-cbdb-parallel", + "make_configs":["src/test/regress:installcheck-cbdb-parallel"] } ] }' @@ -406,6 +424,7 @@ jobs: needs: [check-skip] runs-on: ubuntu-22.04 timeout-minutes: 120 + if: github.event.inputs.reuse_artifacts_from_run_id == '' outputs: build_timestamp: ${{ steps.set_timestamp.outputs.timestamp }} @@ -414,8 +433,33 @@ jobs: options: >- --user root -h cdw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt steps: + - name: Free Disk Space + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "=== Disk space before cleanup ===" + df -h / + + # Remove pre-installed tools from host to free disk space + rm -rf /host_opt/hostedtoolcache || true # GitHub Actions tool cache + rm -rf /host_usr_local/lib/android || true # Android SDK + rm -rf /host_usr_share/dotnet || true # .NET SDK + rm -rf /host_opt/ghc || true # Haskell GHC + rm -rf /host_usr_local/.ghcup || true # Haskell GHCup + rm -rf /host_usr_share/swift || true # Swift + rm -rf /host_usr_local/share/powershell || true # PowerShell + rm -rf /host_usr_local/share/chromium || true # Chromium + rm -rf /host_usr_share/miniconda || true # Miniconda + rm -rf /host_opt/az || true # Azure CLI + rm 
-rf /host_usr_share/sbt || true # Scala Build Tool + + echo "=== Disk space after cleanup ===" + df -h / + - name: Skip Check if: needs.check-skip.outputs.should_skip == 'true' run: | @@ -437,24 +481,6 @@ jobs: fetch-depth: 1 submodules: true - - name: Checkout CI Build/Test Scripts - if: needs.check-skip.outputs.should_skip != 'true' - uses: actions/checkout@v4 - with: - repository: apache/cloudberry-devops-release - ref: main - path: cloudberry-devops-release - fetch-depth: 1 - - - name: Move cloudberry-devops-release directory - if: needs.check-skip.outputs.should_skip != 'true' - run: | - set -eo pipefail - if ! mv "${GITHUB_WORKSPACE}"/cloudberry-devops-release "${GITHUB_WORKSPACE}"/..; then - echo "::error::Container initialization failed" - exit 1 - fi - - name: Cloudberry Environment Initialization if: needs.check-skip.outputs.should_skip != 'true' env: @@ -506,8 +532,8 @@ jobs: SRC_DIR: ${{ github.workspace }} run: | set -eo pipefail - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/configure-cloudberry.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/configure-cloudberry.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh"; then echo "::error::Configure script failed" exit 1 fi @@ -519,8 +545,8 @@ jobs: run: | set -eo pipefail - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/build-cloudberry.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/build-cloudberry.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/build-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/build-cloudberry.sh"; then echo "::error::Build script failed" exit 1 fi @@ -602,7 +628,7 @@ jobs: # Create RPM echo "Creating RPM package..." rpmdev-setuptree - ln -s "${SRC_DIR}"/../cloudberry-devops-release/packaging/rpm/el/SPECS/apache-cloudberry-db-incubating.spec "${HOME}"/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec + ln -s "${SRC_DIR}"/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec "${HOME}"/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec cp "${SRC_DIR}"/LICENSE /usr/local/cloudberry-db DEBUG_RPMBUILD_OPT="" @@ -612,7 +638,7 @@ jobs: DEBUG_IDENTIFIER=".debug" fi - "${SRC_DIR}"/../cloudberry-devops-release/scripts/build-rpm.sh --version "${CBDB_VERSION}" --release "${BUILD_NUMBER}" "${DEBUG_RPMBUILD_OPT}" + "${SRC_DIR}"/devops/build/packaging/rpm/build-rpm.sh --version "${CBDB_VERSION}" --release "${BUILD_NUMBER}" "${DEBUG_RPMBUILD_OPT}" # Get OS version and move RPM os_version=$(grep -oP '(?<=^VERSION_ID=")[0-9]' /etc/os-release) @@ -649,8 +675,8 @@ jobs: SRC_DIR: ${{ github.workspace }} run: | set -eo pipefail - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/unittest-cloudberry.sh - if ! 
time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/unittest-cloudberry.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh"; then echo "::error::Unittest script failed" exit 1 fi @@ -699,6 +725,10 @@ jobs: rpm-install-test: name: RPM Install Test Apache Cloudberry needs: [check-skip, build] + if: | + !cancelled() && + (needs.build.result == 'success' || needs.build.result == 'skipped') && + github.event.inputs.reuse_artifacts_from_run_id == '' runs-on: ubuntu-22.04 timeout-minutes: 120 @@ -707,8 +737,33 @@ jobs: options: >- --user root -h cdw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt steps: + - name: Free Disk Space + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "=== Disk space before cleanup ===" + df -h / + + # Remove pre-installed tools from host to free disk space + rm -rf /host_opt/hostedtoolcache || true # GitHub Actions tool cache + rm -rf /host_usr_local/lib/android || true # Android SDK + rm -rf /host_usr_share/dotnet || true # .NET SDK + rm -rf /host_opt/ghc || true # Haskell GHC + rm -rf /host_usr_local/.ghcup || true # Haskell GHCup + rm -rf /host_usr_share/swift || true # Swift + rm -rf /host_usr_local/share/powershell || true # PowerShell + rm -rf /host_usr_local/share/chromium || true # Chromium + rm -rf /host_usr_share/miniconda || true # Miniconda + rm -rf /host_opt/az || true # Azure CLI + rm -rf /host_usr_share/sbt || true # Scala Build Tool + + echo "=== Disk space after cleanup ===" + df -h / + - name: Skip Check if: needs.check-skip.outputs.should_skip == 'true' run: | @@ -722,6 +777,8 @@ jobs: name: apache-cloudberry-db-incubating-rpm-build-artifacts path: ${{ github.workspace }}/rpm_build_artifacts merge-multiple: false + run-id: ${{ github.event.inputs.reuse_artifacts_from_run_id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} - name: Cloudberry Environment Initialization if: needs.check-skip.outputs.should_skip != 'true' @@ -826,12 +883,18 @@ jobs: echo "Version: ${RPM_VERSION}" echo "Release: ${RPM_RELEASE}" + # Refresh repository metadata to avoid mirror issues + echo "Refreshing repository metadata..." + dnf clean all + dnf makecache --refresh || dnf makecache + # Clean install location rm -rf /usr/local/cloudberry-db - # Install RPM + # Install RPM with retry logic for mirror issues + # Use --releasever=9 to pin to stable Rocky Linux 9 repos (not bleeding-edge 9.6) echo "Starting installation..." - if ! time dnf install -y "${RPM_FILE}"; then + if ! 
time dnf install -y --setopt=retries=10 --releasever=9 "${RPM_FILE}"; then echo "::error::RPM installation failed" exit 1 fi @@ -870,6 +933,9 @@ jobs: test: name: ${{ matrix.test }} needs: [check-skip, build, prepare-test-matrix] + if: | + !cancelled() && + (needs.build.result == 'success' || needs.build.result == 'skipped') runs-on: ubuntu-22.04 timeout-minutes: 120 # actionlint-allow matrix[*].pg_settings @@ -887,8 +953,33 @@ jobs: --ulimit core=-1 --cgroupns=host -v /sys/fs/cgroup:/sys/fs/cgroup:rw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt steps: + - name: Free Disk Space + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "=== Disk space before cleanup ===" + df -h / + + # Remove pre-installed tools from host to free disk space + rm -rf /host_opt/hostedtoolcache || true # GitHub Actions tool cache + rm -rf /host_usr_local/lib/android || true # Android SDK + rm -rf /host_usr_share/dotnet || true # .NET SDK + rm -rf /host_opt/ghc || true # Haskell GHC + rm -rf /host_usr_local/.ghcup || true # Haskell GHCup + rm -rf /host_usr_share/swift || true # Swift + rm -rf /host_usr_local/share/powershell || true # PowerShell + rm -rf /host_usr_local/share/chromium || true # Chromium + rm -rf /host_usr_share/miniconda || true # Miniconda + rm -rf /host_opt/az || true # Azure CLI + rm -rf /host_usr_share/sbt || true # Scala Build Tool + + echo "=== Disk space after cleanup ===" + df -h / + - name: Skip Check if: needs.check-skip.outputs.should_skip == 'true' run: | @@ -900,24 +991,6 @@ jobs: run: | echo "Timestamp from output: ${{ needs.build.outputs.build_timestamp }}" - - name: Checkout CI Build/Test Scripts - if: needs.check-skip.outputs.should_skip != 'true' - uses: actions/checkout@v4 - with: - repository: apache/cloudberry-devops-release - ref: main - path: cloudberry-devops-release - fetch-depth: 1 - - - name: Move cloudberry-devops-release directory - if: needs.check-skip.outputs.should_skip != 'true' - run: | - set -eo pipefail - if ! mv "${GITHUB_WORKSPACE}"/cloudberry-devops-release "${GITHUB_WORKSPACE}"/..; then - echo "::error::Container initialization failed" - exit 1 - fi - - name: Cloudberry Environment Initialization env: LOGS_DIR: build-logs @@ -1117,6 +1190,8 @@ jobs: name: apache-cloudberry-db-incubating-rpm-build-artifacts path: ${{ github.workspace }}/rpm_build_artifacts merge-multiple: false + run-id: ${{ github.event.inputs.reuse_artifacts_from_run_id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} - name: Download Cloudberry Source build artifacts if: needs.check-skip.outputs.should_skip != 'true' @@ -1125,6 +1200,8 @@ jobs: name: apache-cloudberry-db-incubating-source-build-artifacts path: ${{ github.workspace }}/source_build_artifacts merge-multiple: false + run-id: ${{ github.event.inputs.reuse_artifacts_from_run_id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} - name: Verify downloaded artifacts if: needs.check-skip.outputs.should_skip != 'true' @@ -1216,12 +1293,18 @@ jobs: echo "Version: ${RPM_VERSION}" echo "Release: ${RPM_RELEASE}" + # Refresh repository metadata to avoid mirror issues + echo "Refreshing repository metadata..." + dnf clean all + dnf makecache --refresh || dnf makecache + # Clean install location rm -rf /usr/local/cloudberry-db - # Install RPM + # Install RPM with retry logic for mirror issues + # Use --releasever=9 to pin to stable Rocky Linux 9 repos (not bleeding-edge 9.6) echo "Starting installation..." - if ! time dnf install -y "${RPM_FILE}"; then + if ! 
time dnf install -y --setopt=retries=10 --releasever=9 "${RPM_FILE}"; then echo "::error::RPM installation failed" exit 1 fi @@ -1230,6 +1313,22 @@ jobs: rpm -qi apache-cloudberry-db-incubating } 2>&1 | tee -a build-logs/details/rpm-installation.log + # Clean up downloaded RPM artifacts to free disk space + echo "=== Disk space before RPM cleanup ===" + echo "Human readable:" + df -kh / + echo "Exact KB:" + df -k / + echo "RPM artifacts size:" + du -sh "${GITHUB_WORKSPACE}"/rpm_build_artifacts || true + echo "Cleaning up RPM artifacts to free disk space..." + rm -rf "${GITHUB_WORKSPACE}"/rpm_build_artifacts + echo "=== Disk space after RPM cleanup ===" + echo "Human readable:" + df -kh / + echo "Exact KB:" + df -k / + - name: Extract source tarball if: success() && needs.check-skip.outputs.should_skip != 'true' env: @@ -1255,6 +1354,22 @@ jobs: du -sh "${SRC_DIR}/../cloudberry" } 2>&1 | tee -a build-logs/details/source-extraction.log + # Clean up source tarball to free disk space + echo "=== Disk space before source tarball cleanup ===" + echo "Human readable:" + df -kh / + echo "Exact KB:" + df -k / + echo "Source tarball artifacts size:" + du -sh "${GITHUB_WORKSPACE}"/source_build_artifacts || true + echo "Cleaning up source tarball to free disk space..." + rm -rf "${GITHUB_WORKSPACE}"/source_build_artifacts + echo "=== Disk space after source tarball cleanup ===" + echo "Human readable:" + df -kh / + echo "Exact KB:" + df -k / + - name: Create Apache Cloudberry demo cluster if: success() && needs.check-skip.outputs.should_skip != 'true' env: @@ -1263,8 +1378,8 @@ jobs: set -eo pipefail { - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='${{ matrix.num_primary_mirror_pairs }}' SRC_DIR=${SRC_DIR} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='${{ matrix.num_primary_mirror_pairs }}' SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then echo "::error::Demo cluster creation failed" exit 1 fi @@ -1339,7 +1454,7 @@ jobs: MAKE_DIRECTORY='-C $dir' \ PGOPTIONS='${PG_OPTS}' \ SRC_DIR='${SRC_DIR}' \ - ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/test-cloudberry.sh" \ + ${SRC_DIR}/devops/build/automation/cloudberry/scripts/test-cloudberry.sh" \ 2>&1 | tee "$config_log"; then echo "::warning::Test execution failed for configuration $((i+1)): make -C $dir $target" overall_status=1 @@ -1372,7 +1487,7 @@ jobs: ls -Rl "/tmp/cloudberry-cores" echo "-----------------------------------------" - "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/analyze_core_dumps.sh "$test_id" + "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/analyze_core_dumps.sh "$test_id" core_analysis_rc=$? case "$core_analysis_rc" in 0) echo "No core dumps found for this configuration" ;; @@ -1448,7 +1563,7 @@ jobs: # Parse this configuration's results MAKE_NAME="${{ matrix.test }}-config$i" \ - "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/parse-test-results.sh "$config_log" + "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/parse-test-results.sh "$config_log" status_code=$? 
{ diff --git a/.github/workflows/build-dbg-cloudberry.yml b/.github/workflows/build-dbg-cloudberry.yml index 998242efcd7..967fc259f0b 100644 --- a/.github/workflows/build-dbg-cloudberry.yml +++ b/.github/workflows/build-dbg-cloudberry.yml @@ -102,9 +102,9 @@ name: Apache Cloudberry Build Debug on: push: - branches: [main] + branches: [main, REL_2_STABLE] pull_request: - branches: [main] + branches: [main, REL_2_STABLE] types: [opened, synchronize, reopened, edited] workflow_dispatch: inputs: @@ -222,9 +222,6 @@ jobs: # Define base test configurations ALL_TESTS='{ "include": [ - {"test":"ic-cbdb-parallel", - "make_configs":["src/test/regress:installcheck-cbdb-parallel"] - } ] }' @@ -346,24 +343,6 @@ jobs: fetch-depth: 1 submodules: true - - name: Checkout CI Build/Test Scripts - if: needs.check-skip.outputs.should_skip != 'true' - uses: actions/checkout@v4 - with: - repository: apache/cloudberry-devops-release - ref: main - path: cloudberry-devops-release - fetch-depth: 1 - - - name: Move cloudberry-devops-release directory - if: needs.check-skip.outputs.should_skip != 'true' - run: | - set -eo pipefail - if ! mv "${GITHUB_WORKSPACE}"/cloudberry-devops-release "${GITHUB_WORKSPACE}"/..; then - echo "::error::Container initialization failed" - exit 1 - fi - - name: Cloudberry Environment Initialization if: needs.check-skip.outputs.should_skip != 'true' env: @@ -415,8 +394,8 @@ jobs: SRC_DIR: ${{ github.workspace }} run: | set -eo pipefail - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/configure-cloudberry.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/configure-cloudberry.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh"; then echo "::error::Configure script failed" exit 1 fi @@ -428,8 +407,8 @@ jobs: run: | set -eo pipefail - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/build-cloudberry.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/build-cloudberry.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/build-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/build-cloudberry.sh"; then echo "::error::Build script failed" exit 1 fi @@ -511,7 +490,7 @@ jobs: # Create RPM echo "Creating RPM package..." 
rpmdev-setuptree - ln -s "${SRC_DIR}"/../cloudberry-devops-release/packaging/rpm/el/SPECS/apache-cloudberry-db-incubating.spec "${HOME}"/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec + ln -s "${SRC_DIR}"/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec "${HOME}"/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec cp "${SRC_DIR}"/LICENSE /usr/local/cloudberry-db DEBUG_RPMBUILD_OPT="" @@ -521,7 +500,7 @@ jobs: DEBUG_IDENTIFIER=".debug" fi - "${SRC_DIR}"/../cloudberry-devops-release/scripts/build-rpm.sh --version "${CBDB_VERSION}" --release "${BUILD_NUMBER}" "${DEBUG_RPMBUILD_OPT}" + "${SRC_DIR}"/devops/build/packaging/rpm/build-rpm.sh --version "${CBDB_VERSION}" --release "${BUILD_NUMBER}" "${DEBUG_RPMBUILD_OPT}" # Get OS version and move RPM os_version=$(grep -oP '(?<=^VERSION_ID=")[0-9]' /etc/os-release) @@ -556,8 +535,8 @@ jobs: SRC_DIR: ${{ github.workspace }} run: | set -eo pipefail - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/unittest-cloudberry.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/unittest-cloudberry.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh"; then echo "::error::Unittest script failed" exit 1 fi @@ -807,24 +786,6 @@ jobs: run: | echo "Timestamp from output: ${{ needs.build.outputs.build_timestamp }}" - - name: Checkout CI Build/Test Scripts - if: needs.check-skip.outputs.should_skip != 'true' - uses: actions/checkout@v4 - with: - repository: apache/cloudberry-devops-release - ref: main - path: cloudberry-devops-release - fetch-depth: 1 - - - name: Move cloudberry-devops-release directory - if: needs.check-skip.outputs.should_skip != 'true' - run: | - set -eo pipefail - if ! mv "${GITHUB_WORKSPACE}"/cloudberry-devops-release "${GITHUB_WORKSPACE}"/..; then - echo "::error::Container initialization failed" - exit 1 - fi - - name: Cloudberry Environment Initialization env: LOGS_DIR: build-logs @@ -1170,8 +1131,8 @@ jobs: set -eo pipefail { - chmod +x "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh - if ! time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='${{ matrix.num_primary_mirror_pairs }}' SRC_DIR=${SRC_DIR} ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh + if ! 
time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='${{ matrix.num_primary_mirror_pairs }}' SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then echo "::error::Demo cluster creation failed" exit 1 fi @@ -1242,7 +1203,7 @@ jobs: MAKE_DIRECTORY='-C $dir' \ PGOPTIONS='${PG_OPTS}' \ SRC_DIR='${SRC_DIR}' \ - ${SRC_DIR}/../cloudberry-devops-release/build_automation/cloudberry/scripts/test-cloudberry.sh" \ + ${SRC_DIR}/devops/build/automation/cloudberry/scripts/test-cloudberry.sh" \ 2>&1 | tee "$config_log"; then echo "::warning::Test execution failed for configuration $((i+1)): make -C $dir $target" overall_status=1 @@ -1275,7 +1236,7 @@ jobs: ls -Rl "/tmp/cloudberry-cores" echo "-----------------------------------------" - "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/analyze_core_dumps.sh "$test_id" + "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/analyze_core_dumps.sh "$test_id" core_analysis_rc=$? case "$core_analysis_rc" in 0) echo "No core dumps found for this configuration" ;; @@ -1351,7 +1312,7 @@ jobs: # Parse this configuration's results MAKE_NAME="${{ matrix.test }}-config$i" \ - "${SRC_DIR}"/../cloudberry-devops-release/build_automation/cloudberry/scripts/parse-test-results.sh "$config_log" + "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/parse-test-results.sh "$config_log" status_code=$? { diff --git a/.github/workflows/build-deb-cloudberry.yml b/.github/workflows/build-deb-cloudberry.yml new file mode 100644 index 00000000000..be28fff9e77 --- /dev/null +++ b/.github/workflows/build-deb-cloudberry.yml @@ -0,0 +1,1841 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# GitHub Actions Workflow: Apache Cloudberry Build Pipeline +# -------------------------------------------------------------------- +# Description: +# +# This workflow builds, tests, and packages Apache Cloudberry on +# Ubuntu 22.04. It ensures artifact integrity and performs installation +# tests. +# +# Workflow Overview: +# 1. **Build Job**: +# - Configures and builds Apache Cloudberry. +# - Supports debug build configuration via ENABLE_DEBUG flag. +# - Runs unit tests and verifies build artifacts. +# - Creates DEB packages (regular and debug), source tarball +# and additional files for dupload utility. +# - **Key Artifacts**: DEB package, source tarball, changes and dsc files, build logs. +# +# 2. **DEB Install Test Job**: +# - Verifies DEB integrity and installs Cloudberry. +# - Validates successful installation. +# - **Key Artifacts**: Installation logs, verification results. +# +# 3. 
**Report Job**: +# - Aggregates job results into a final report. +# - Sends failure notifications if any step fails. +# +# Execution Environment: +# - **Runs On**: ubuntu-22.04 with ubuntu-22.04 containers. +# - **Resource Requirements**: +# - Disk: Minimum 20GB free space. +# - Memory: Minimum 8GB RAM. +# - CPU: Recommended 4+ cores. +# +# Triggers: +# - Push to `main` branch. +# - Pull requests to `main` branch. +# - Manual workflow dispatch. +# +# Container Images: +# - **Build**: `apache/incubator-cloudberry:cbdb-build-ubuntu22.04-latest` +# - **Test**: `apache/incubator-cloudberry:cbdb-test-ubuntu22.04-latest` +# +# Artifacts: +# - DEB Package (retention: ${{ env.LOG_RETENTION_DAYS }} days). +# - Changes and DSC files (retention: ${{ env.LOG_RETENTION_DAYS }} days). +# - Source Tarball (retention: ${{ env.LOG_RETENTION_DAYS }} days). +# - Logs and Test Results (retention: ${{ env.LOG_RETENTION_DAYS }} days). +# +# Notes: +# - Supports concurrent job execution. +# - Supports debug builds with preserved symbols. +# -------------------------------------------------------------------- + +name: Apache Cloudberry Debian Build + +on: + push: + branches: [main, REL_2_STABLE] + pull_request: + branches: [main, REL_2_STABLE] + types: [opened, synchronize, reopened, edited] + workflow_dispatch: # Manual trigger + inputs: + test_selection: + description: 'Select tests to run (comma-separated). Examples: ic-good-opt-off,ic-contrib' + required: false + default: 'all' + type: string + reuse_artifacts_from_run_id: + description: 'Reuse build artifacts from a previous run ID (leave empty to build fresh)' + required: false + default: '' + type: string + +# Note: Step details, logs, and artifacts require users to be logged into GitHub +# even for public repositories. This is a GitHub security feature and cannot +# be overridden by permissions. 
+ +permissions: + # READ permissions allow viewing repository contents + contents: read # Required for checking out code and reading repository files + + # READ permissions for packages (Container registry, etc) + packages: read # Allows reading from GitHub package registry + + # WRITE permissions for actions includes read access to: + # - Workflow runs + # - Artifacts (requires GitHub login) + # - Logs (requires GitHub login) + actions: write + + # READ permissions for checks API: + # - Step details visibility (requires GitHub login) + # - Check run status and details + checks: read + + # READ permissions for pull request metadata: + # - PR status + # - Associated checks + # - Review states + pull-requests: read + +env: + LOG_RETENTION_DAYS: 7 + ENABLE_DEBUG: false + +jobs: + + ## ====================================================================== + ## Job: check-skip + ## ====================================================================== + + check-skip: + runs-on: ubuntu-22.04 + outputs: + should_skip: ${{ steps.skip-check.outputs.should_skip }} + steps: + - id: skip-check + shell: bash + env: + EVENT_NAME: ${{ github.event_name }} + PR_TITLE: ${{ github.event.pull_request.title || '' }} + PR_BODY: ${{ github.event.pull_request.body || '' }} + run: | + # Default to not skipping + echo "should_skip=false" >> "$GITHUB_OUTPUT" + + # Apply skip logic only for pull_request events + if [[ "$EVENT_NAME" == "pull_request" ]]; then + # Combine PR title and body for skip check + MESSAGE="${PR_TITLE}\n${PR_BODY}" + + # Escape special characters using printf %s + ESCAPED_MESSAGE=$(printf "%s" "$MESSAGE") + + echo "Checking PR title and body (escaped): $ESCAPED_MESSAGE" + + # Check for skip patterns + if echo -e "$ESCAPED_MESSAGE" | grep -qEi '\[skip[ -]ci\]|\[ci[ -]skip\]|\[no[ -]ci\]'; then + echo "should_skip=true" >> "$GITHUB_OUTPUT" + fi + else + echo "Skip logic is not applied for $EVENT_NAME events." + fi + + - name: Report Skip Status + if: steps.skip-check.outputs.should_skip == 'true' + run: | + echo "CI Skip flag detected in PR - skipping all checks." 
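+          # Exit 0 so the skip itself is reported as a passing check; downstream jobs
+          # gate on the should_skip output instead of this job's status.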
+ exit 0 + + ## ====================================================================== + ## Job: prepare-test-matrix-deb + ## ====================================================================== + + prepare-test-matrix-deb: + runs-on: ubuntu-22.04 + needs: [check-skip] + if: needs.check-skip.outputs.should_skip != 'true' + outputs: + test-matrix: ${{ steps.set-matrix.outputs.matrix }} + + steps: + - id: set-matrix + run: | + echo "=== Matrix Preparation Diagnostics ===" + echo "Event type: ${{ github.event_name }}" + echo "Test selection input: '${{ github.event.inputs.test_selection }}'" + + # Define defaults + DEFAULT_NUM_PRIMARY_MIRROR_PAIRS=3 + DEFAULT_ENABLE_CGROUPS=false + DEFAULT_ENABLE_CORE_CHECK=true + DEFAULT_PG_SETTINGS_OPTIMIZER="" + + # Define base test configurations + ALL_TESTS='{ + "include": [ + {"test":"ic-deb-good-opt-off", + "make_configs":["src/test/regress:installcheck-good"], + "pg_settings":{"optimizer":"off"} + }, + {"test":"ic-deb-good-opt-on", + "make_configs":["src/test/regress:installcheck-good"], + "pg_settings":{"optimizer":"on"} + }, + {"test":"pax-ic-deb-good-opt-off", + "make_configs":[ + "contrib/pax_storage/:pax-test", + "contrib/pax_storage/:regress_test" + ], + "pg_settings":{ + "optimizer":"off", + "default_table_access_method":"pax" + } + }, + {"test":"pax-ic-deb-good-opt-on", + "make_configs":[ + "contrib/pax_storage/:pax-test", + "contrib/pax_storage/:regress_test" + ], + "pg_settings":{ + "optimizer":"on", + "default_table_access_method":"pax" + } + }, + {"test":"ic-deb-contrib", + "make_configs":["contrib/auto_explain:installcheck", + "contrib/amcheck:installcheck", + "contrib/citext:installcheck", + "contrib/btree_gin:installcheck", + "contrib/btree_gist:installcheck", + "contrib/dblink:installcheck", + "contrib/dict_int:installcheck", + "contrib/dict_xsyn:installcheck", + "contrib/extprotocol:installcheck", + "contrib/file_fdw:installcheck", + "contrib/formatter_fixedwidth:installcheck", + "contrib/hstore:installcheck", + "contrib/indexscan:installcheck", + "contrib/pg_trgm:installcheck", + "contrib/indexscan:installcheck", + "contrib/pgcrypto:installcheck", + "contrib/pgstattuple:installcheck", + "contrib/tablefunc:installcheck", + "contrib/passwordcheck:installcheck", + "contrib/pg_buffercache:installcheck", + "contrib/sslinfo:installcheck"] + }, + {"test":"ic-deb-gpcontrib", + "make_configs":["gpcontrib/orafce:installcheck", + "gpcontrib/pxf_fdw:installcheck", + "gpcontrib/zstd:installcheck", + "gpcontrib/gp_sparse_vector:installcheck", + "gpcontrib/gp_toolkit:installcheck"] + }, + {"test":"ic-cbdb-parallel", + "make_configs":["src/test/regress:installcheck-cbdb-parallel"] + } + ] + }' + + # Function to apply defaults + apply_defaults() { + echo "$1" | jq --arg npm "$DEFAULT_NUM_PRIMARY_MIRROR_PAIRS" \ + --argjson ec "$DEFAULT_ENABLE_CGROUPS" \ + --argjson ecc "$DEFAULT_ENABLE_CORE_CHECK" \ + --arg opt "$DEFAULT_PG_SETTINGS_OPTIMIZER" \ + 'def get_defaults: + { + num_primary_mirror_pairs: ($npm|tonumber), + enable_cgroups: $ec, + enable_core_check: $ecc, + pg_settings: { + optimizer: $opt + } + }; + get_defaults * .' 
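+            # jq's '*' above is a recursive object merge, so values set on an individual
+            # test entry take precedence over the defaults supplied by get_defaults.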
+ } + + # Extract all valid test names from ALL_TESTS + VALID_TESTS=$(echo "$ALL_TESTS" | jq -r '.include[].test') + + # Parse input test selection + IFS=',' read -ra SELECTED_TESTS <<< "${{ github.event.inputs.test_selection }}" + + # Default to all tests if selection is empty or 'all' + if [[ "${SELECTED_TESTS[*]}" == "all" || -z "${SELECTED_TESTS[*]}" ]]; then + mapfile -t SELECTED_TESTS <<< "$VALID_TESTS" + fi + + # Validate and filter selected tests + INVALID_TESTS=() + FILTERED_TESTS=() + for TEST in "${SELECTED_TESTS[@]}"; do + TEST=$(echo "$TEST" | tr -d '[:space:]') # Trim whitespace + if echo "$VALID_TESTS" | grep -qw "$TEST"; then + FILTERED_TESTS+=("$TEST") + else + INVALID_TESTS+=("$TEST") + fi + done + + # Handle invalid tests + if [[ ${#INVALID_TESTS[@]} -gt 0 ]]; then + echo "::error::Invalid test(s) selected: ${INVALID_TESTS[*]}" + echo "Valid tests are: $(echo "$VALID_TESTS" | tr '\n' ', ')" + exit 1 + fi + + # Build result JSON with defaults applied + RESULT='{"include":[' + FIRST=true + for TEST in "${FILTERED_TESTS[@]}"; do + CONFIG=$(jq -c --arg test "$TEST" '.include[] | select(.test == $test)' <<< "$ALL_TESTS") + FILTERED_WITH_DEFAULTS=$(apply_defaults "$CONFIG") + if [[ "$FIRST" == true ]]; then + FIRST=false + else + RESULT="${RESULT}," + fi + RESULT="${RESULT}${FILTERED_WITH_DEFAULTS}" + done + RESULT="${RESULT}]}" + + # Output the matrix for GitHub Actions + echo "Final matrix configuration:" + echo "$RESULT" | jq . + + # Fix: Use block redirection + { + echo "matrix<> "$GITHUB_OUTPUT" + + echo "=== Matrix Preparation Complete ===" + + ## ====================================================================== + ## Job: build-deb + ## ====================================================================== + + build-deb: + name: Build Apache Cloudberry DEB + env: + JOB_TYPE: build + needs: [check-skip] + runs-on: ubuntu-22.04 + timeout-minutes: 120 + if: github.event.inputs.reuse_artifacts_from_run_id == '' + outputs: + build_timestamp: ${{ steps.set_timestamp.outputs.timestamp }} + + container: + image: apache/incubator-cloudberry:cbdb-build-ubuntu22.04-latest + options: >- + --user root + -h cdw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt + + steps: + - name: Free Disk Space + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "=== Disk space before cleanup ===" + df -h / + + # Remove pre-installed tools from host to free disk space + rm -rf /host_opt/hostedtoolcache || true # GitHub Actions tool cache + rm -rf /host_usr_local/lib/android || true # Android SDK + rm -rf /host_usr_share/dotnet || true # .NET SDK + rm -rf /host_opt/ghc || true # Haskell GHC + rm -rf /host_usr_local/.ghcup || true # Haskell GHCup + rm -rf /host_usr_share/swift || true # Swift + rm -rf /host_usr_local/share/powershell || true # PowerShell + rm -rf /host_usr_local/share/chromium || true # Chromium + rm -rf /host_usr_share/miniconda || true # Miniconda + rm -rf /host_opt/az || true # Azure CLI + rm -rf /host_usr_share/sbt || true # Scala Build Tool + + echo "=== Disk space after cleanup ===" + df -h / + + - name: Skip Check + if: needs.check-skip.outputs.should_skip == 'true' + run: | + echo "Build skipped via CI skip flag" >> "$GITHUB_STEP_SUMMARY" + exit 0 + + - name: Set build timestamp + id: set_timestamp # Add an ID to reference this step + run: | + timestamp=$(date +'%Y%m%d_%H%M%S') + echo "timestamp=$timestamp" | tee -a "$GITHUB_OUTPUT" # Use GITHUB_OUTPUT for job outputs + echo "BUILD_TIMESTAMP=$timestamp" | tee -a 
"$GITHUB_ENV" # Also set as environment variable + + - name: Checkout Apache Cloudberry + uses: actions/checkout@v4 + with: + fetch-depth: 1 + submodules: true + + - name: Cloudberry Environment Initialization + shell: bash + env: + LOGS_DIR: build-logs + run: | + set -eo pipefail + if ! su - gpadmin -c "/tmp/init_system.sh"; then + echo "::error::Container initialization failed" + exit 1 + fi + + mkdir -p "${LOGS_DIR}/details" + chown -R gpadmin:gpadmin . + chmod -R 755 . + chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + - name: Generate Build Job Summary Start + run: | + { + echo "# Build Job Summary" + echo "## Environment" + echo "- Start Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "- ENABLE_DEBUG: ${{ env.ENABLE_DEBUG }}" + echo "- OS Version: $(lsb_release -sd)" + echo "- GCC Version: $(gcc --version | head -n1)" + } >> "$GITHUB_STEP_SUMMARY" + + - name: Run Apache Cloudberry configure script + shell: bash + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + export BUILD_DESTINATION=${SRC_DIR}/debian/build + + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} BUILD_DESTINATION=${BUILD_DESTINATION} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh"; then + echo "::error::Configure script failed" + exit 1 + fi + + - name: Run Apache Cloudberry build script + shell: bash + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + export BUILD_DESTINATION=${SRC_DIR}/debian/build + + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/build-cloudberry.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} BUILD_DESTINATION=${BUILD_DESTINATION} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/build-cloudberry.sh"; then + echo "::error::Build script failed" + exit 1 + fi + + - name: Verify build artifacts + shell: bash + run: | + set -eo pipefail + + export BUILD_DESTINATION=${SRC_DIR}/debian/build + + echo "Verifying build artifacts..." + { + echo "=== Build Artifacts Verification ===" + echo "Timestamp: $(date -u)" + + if [ ! -d "${BUILD_DESTINATION}" ]; then + echo "::error::Build artifacts directory not found" + exit 1 + fi + + # Verify critical binaries + critical_binaries=( + "${BUILD_DESTINATION}/bin/postgres" + "${BUILD_DESTINATION}/bin/psql" + ) + + echo "Checking critical binaries..." + for binary in "${critical_binaries[@]}"; do + if [ ! -f "$binary" ]; then + echo "::error::Critical binary missing: $binary" + exit 1 + fi + if [ ! -x "$binary" ]; then + echo "::error::Binary not executable: $binary" + exit 1 + fi + echo "Binary verified: $binary" + ls -l "$binary" + done + + # Test binary execution + echo "Testing binary execution..." + if ! ${BUILD_DESTINATION}/bin/postgres --version; then + echo "::error::postgres binary verification failed" + exit 1 + fi + if ! 
${BUILD_DESTINATION}/bin/psql --version; then + echo "::error::psql binary verification failed" + exit 1 + fi + + echo "All build artifacts verified successfully" + } 2>&1 | tee -a build-logs/details/build-verification.log + + - name: Create Source tarball, create DEB and verify artifacts + shell: bash + env: + CBDB_VERSION: 99.0.0 + BUILD_NUMBER: 1 + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + { + echo "=== Artifact Creation Log ===" + echo "Timestamp: $(date -u)" + + cp -r "${SRC_DIR}"/devops/build/packaging/deb/ubuntu22.04/* debian/ + chown -R "$(whoami)" debian + chmod -x debian/*install + + # replace not supported symbols in version + CBDB_VERSION=$(echo "$CBDB_VERSION" | sed "s/\//./g") + CBDB_VERSION=$(echo "$CBDB_VERSION" | sed "s/_/-/g") + + echo "We will built ${CBDB_VERSION}" + export BUILD_DESTINATION=${SRC_DIR}/debian/build + + if ! ${SRC_DIR}/devops/build/packaging/deb/build-deb.sh -v $CBDB_VERSION; then + echo "::error::Build script failed" + exit 1 + fi + + ARCH="amd64" + CBDB_PKG_VERSION=${CBDB_VERSION}-${BUILD_NUMBER}-$(git --git-dir=.git rev-list HEAD --count).$(git --git-dir=.git rev-parse --short HEAD) + + echo "Produced artifacts" + ls -l ../ + + echo "Copy artifacts to subdirectory for sign/upload" + mkdir ${SRC_DIR}/deb + DEB_FILE="apache-cloudberry-db-incubating_${CBDB_PKG_VERSION}"_"${ARCH}".deb + DBG_DEB_FILE="apache-cloudberry-db-incubating-dbgsym_${CBDB_PKG_VERSION}"_"${ARCH}".ddeb + CHANGES_DEB_FILE="apache-cloudberry-db-incubating_${CBDB_PKG_VERSION}"_"${ARCH}".changes + BUILDINFO_DEB_FILE="apache-cloudberry-db-incubating_${CBDB_PKG_VERSION}"_"${ARCH}".buildinfo + DSC_DEB_FILE="apache-cloudberry-db-incubating_${CBDB_PKG_VERSION}".dsc + SOURCE_FILE="apache-cloudberry-db-incubating_${CBDB_PKG_VERSION}".tar.xz + cp ../"${DEB_FILE}" "${SRC_DIR}/deb" + cp ../"${DBG_DEB_FILE}" "${SRC_DIR}/deb" + cp ../"${CHANGES_DEB_FILE}" "${SRC_DIR}/deb" + cp ../"${BUILDINFO_DEB_FILE}" "${SRC_DIR}/deb" + cp ../"${DSC_DEB_FILE}" "${SRC_DIR}/deb" + cp ../"${SOURCE_FILE}" "${SRC_DIR}/deb" + mkdir "${SRC_DIR}/deb/debian" + cp debian/changelog "${SRC_DIR}/deb/debian" + + # Get package information + echo "Package Information:" + dpkg --info "${SRC_DIR}/deb/${DEB_FILE}" + dpkg --contents "${SRC_DIR}/deb/${DEB_FILE}" + + # Verify critical files in DEB + echo "Verifying critical files in DEB..." + for binary in "bin/postgres" "bin/psql"; do + if ! dpkg --contents "${SRC_DIR}/deb/${DEB_FILE}" | grep -c "${binary}$"; then + echo "::error::Critical binary '${binary}' not found in DEB" + exit 1 + fi + done + + # Record checksums + echo "Calculating checksums..." + sha256sum "${SRC_DIR}/deb/${DEB_FILE}" | tee -a build-logs/details/checksums.log + + echo "Artifacts created and verified successfully" + + + } 2>&1 | tee -a build-logs/details/artifact-creation.log + + - name: Run Apache Cloudberry unittest script + if: needs.check-skip.outputs.should_skip != 'true' + shell: bash + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh + if ! 
time su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh"; then + echo "::error::Unittest script failed" + exit 1 + fi + + - name: Generate Build Job Summary End + run: | + { + echo "## Build Results" + echo "- End Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + } >> "$GITHUB_STEP_SUMMARY" + + - name: Upload build logs + uses: actions/upload-artifact@v4 + with: + name: build-logs-${{ env.BUILD_TIMESTAMP }} + path: | + build-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload Cloudberry DEB build artifacts + uses: actions/upload-artifact@v4 + with: + name: apache-cloudberry-db-incubating-deb-build-artifacts + retention-days: ${{ env.LOG_RETENTION_DAYS }} + if-no-files-found: error + path: | + deb/*.deb + deb/*.ddeb + + - name: Upload Cloudberry deb source build artifacts + uses: actions/upload-artifact@v4 + with: + name: apache-cloudberry-db-incubating-deb-source-build-artifacts + retention-days: ${{ env.LOG_RETENTION_DAYS }} + if-no-files-found: error + path: | + deb/*.tar.xz + deb/*.changes + deb/*.dsc + deb/*.buildinfo + deb/debian/changelog + + ## ====================================================================== + ## Job: deb-install-test + ## ====================================================================== + + deb-install-test: + name: DEB Install Test Apache Cloudberry + needs: [check-skip, build-deb] + if: | + !cancelled() && + (needs.build-deb.result == 'success' || needs.build-deb.result == 'skipped') && + github.event.inputs.reuse_artifacts_from_run_id == '' + runs-on: ubuntu-22.04 + timeout-minutes: 120 + + container: + image: apache/incubator-cloudberry:cbdb-test-ubuntu22.04-latest + options: >- + --user root + -h cdw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt + + steps: + - name: Free Disk Space + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "=== Disk space before cleanup ===" + df -h / + + # Remove pre-installed tools from host to free disk space + rm -rf /host_opt/hostedtoolcache || true # GitHub Actions tool cache + rm -rf /host_usr_local/lib/android || true # Android SDK + rm -rf /host_usr_share/dotnet || true # .NET SDK + rm -rf /host_opt/ghc || true # Haskell GHC + rm -rf /host_usr_local/.ghcup || true # Haskell GHCup + rm -rf /host_usr_share/swift || true # Swift + rm -rf /host_usr_local/share/powershell || true # PowerShell + rm -rf /host_usr_local/share/chromium || true # Chromium + rm -rf /host_usr_share/miniconda || true # Miniconda + rm -rf /host_opt/az || true # Azure CLI + rm -rf /host_usr_share/sbt || true # Scala Build Tool + + echo "=== Disk space after cleanup ===" + df -h / + + - name: Skip Check + if: needs.check-skip.outputs.should_skip == 'true' + run: | + echo "DEB install test skipped via CI skip flag" >> "$GITHUB_STEP_SUMMARY" + exit 0 + + - name: Download Cloudberry DEB build artifacts + if: needs.check-skip.outputs.should_skip != 'true' + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-deb-build-artifacts + path: ${{ github.workspace }}/deb_build_artifacts + run-id: ${{ github.event.inputs.reuse_artifacts_from_run_id || github.run_id }} + merge-multiple: false + + - name: Cloudberry Environment Initialization + if: needs.check-skip.outputs.should_skip != 'true' + shell: bash + env: + LOGS_DIR: install-logs + run: | + set -eo pipefail + if ! 
su - gpadmin -c "/tmp/init_system.sh"; then + echo "::error::Container initialization failed" + exit 1 + fi + + mkdir -p "${LOGS_DIR}/details" + chown -R gpadmin:gpadmin . + chmod -R 755 . + chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + - name: Verify DEB artifacts + id: verify-artifacts + shell: bash + run: | + set -eo pipefail + + DEB_FILE=$(ls "${GITHUB_WORKSPACE}"/deb_build_artifacts/*.deb) + if [ ! -f "${DEB_FILE}" ]; then + echo "::error::DEB file not found" + exit 1 + fi + + echo "deb_file=${DEB_FILE}" >> "$GITHUB_OUTPUT" + + echo "Verifying DEB artifacts..." + { + echo "=== DEB Verification Summary ===" + echo "Timestamp: $(date -u)" + echo "DEB File: ${DEB_FILE}" + + # Get DEB metadata and verify contents + echo "Package Information:" + dpkg-deb -f "${DEB_FILE}" + + # Get key DEB attributes for verification + DEB_VERSION=$(dpkg-deb -f "${DEB_FILE}" Version | cut -d'-' -f 1) + DEB_RELEASE=$(dpkg-deb -f "${DEB_FILE}" Version | cut -d'-' -f 3) + echo "version=${DEB_VERSION}" >> "$GITHUB_OUTPUT" + echo "release=${DEB_RELEASE}" >> "$GITHUB_OUTPUT" + + # Verify expected binaries are in the DEB + echo "Verifying critical files in DEB..." + for binary in "bin/postgres" "bin/psql"; do + if ! dpkg-deb -c "${DEB_FILE}" | grep "${binary}" > /dev/null; then + echo "::error::Critical binary '${binary}' not found in DEB" + exit 1 + fi + done + + echo "DEB Details:" + echo "- Version: ${DEB_VERSION}" + echo "- Release: ${DEB_RELEASE}" + + # Calculate and store checksum + echo "Checksum:" + sha256sum "${DEB_FILE}" + + } 2>&1 | tee -a install-logs/details/deb-verification.log + + - name: Install Cloudberry DEB + shell: bash + env: + DEB_FILE: ${{ steps.verify-artifacts.outputs.deb_file }} + DEB_VERSION: ${{ steps.verify-artifacts.outputs.version }} + DEB_RELEASE: ${{ steps.verify-artifacts.outputs.release }} + run: | + set -eo pipefail + + if [ -z "${DEB_FILE}" ]; then + echo "::error::DEB_FILE environment variable is not set" + exit 1 + fi + + { + echo "=== DEB Installation Log ===" + echo "Timestamp: $(date -u)" + echo "DEB File: ${DEB_FILE}" + echo "Version: ${DEB_VERSION}" + echo "Release: ${DEB_RELEASE}" + + # Clean install location + rm -rf /usr/local/cloudberry-db + + # Install DEB + echo "Starting installation..." + apt-get update + if ! 
apt-get -y install "${DEB_FILE}"; then + echo "::error::DEB installation failed" + exit 1 + fi + + # Change ownership back to gpadmin - it is needed for future tests + chown -R gpadmin:gpadmin /usr/local/cloudberry-db + + echo "Installation completed successfully" + dpkg-query -s apache-cloudberry-db-incubating + echo "Installed files:" + dpkg-query -L apache-cloudberry-db-incubating + } 2>&1 | tee -a install-logs/details/deb-installation.log + + - name: Upload install logs + uses: actions/upload-artifact@v4 + with: + name: install-logs-${{ matrix.name }}-${{ needs.build-deb.outputs.build_timestamp }} + path: | + install-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Generate Install Test Job Summary End + if: always() + shell: bash {0} + run: | + { + echo "# Installed Package Summary" + echo "\`\`\`" + + dpkg-query -s apache-cloudberry-db-incubating + echo "\`\`\`" + } >> "$GITHUB_STEP_SUMMARY" || true + + ## ====================================================================== + ## Job: test-deb + ## ====================================================================== + + test-deb: + name: ${{ matrix.test }} + needs: [check-skip, build-deb, prepare-test-matrix-deb] + if: | + !cancelled() && + (needs.build-deb.result == 'success' || needs.build-deb.result == 'skipped') + runs-on: ubuntu-22.04 + timeout-minutes: 120 + # actionlint-allow matrix[*].pg_settings + strategy: + fail-fast: false # Continue with other tests if one fails + matrix: ${{ fromJson(needs.prepare-test-matrix-deb.outputs.test-matrix) }} + + container: + image: apache/incubator-cloudberry:cbdb-build-ubuntu22.04-latest + options: >- + --privileged + --user root + --hostname cdw + --shm-size=2gb + --ulimit core=-1 + --cgroupns=host + -v /sys/fs/cgroup:/sys/fs/cgroup:rw + -v /usr/share:/host_usr_share + -v /usr/local:/host_usr_local + -v /opt:/host_opt + + steps: + - name: Free Disk Space + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "=== Disk space before cleanup ===" + df -h / + + # Remove pre-installed tools from host to free disk space + rm -rf /host_opt/hostedtoolcache || true # GitHub Actions tool cache + rm -rf /host_usr_local/lib/android || true # Android SDK + rm -rf /host_usr_share/dotnet || true # .NET SDK + rm -rf /host_opt/ghc || true # Haskell GHC + rm -rf /host_usr_local/.ghcup || true # Haskell GHCup + rm -rf /host_usr_share/swift || true # Swift + rm -rf /host_usr_local/share/powershell || true # PowerShell + rm -rf /host_usr_local/share/chromium || true # Chromium + rm -rf /host_usr_share/miniconda || true # Miniconda + rm -rf /host_opt/az || true # Azure CLI + rm -rf /host_usr_share/sbt || true # Scala Build Tool + + echo "=== Disk space after cleanup ===" + df -h / + + - name: Skip Check + if: needs.check-skip.outputs.should_skip == 'true' + run: | + echo "Test ${{ matrix.test }} skipped via CI skip flag" >> "$GITHUB_STEP_SUMMARY" + exit 0 + + - name: Use timestamp from previous job + if: needs.check-skip.outputs.should_skip != 'true' + run: | + echo "Timestamp from output: ${{ needs.build-deb.outputs.build_timestamp }}" + + - name: Cloudberry Environment Initialization + shell: bash + env: + LOGS_DIR: build-logs + run: | + set -eo pipefail + if ! su - gpadmin -c "/tmp/init_system.sh"; then + echo "::error::Container initialization failed" + exit 1 + fi + + mkdir -p "${LOGS_DIR}/details" + chown -R gpadmin:gpadmin . + chmod -R 755 . 
+ chmod 777 "${LOGS_DIR}" + + df -kh / + rm -rf /__t/* + df -kh / + + df -h | tee -a "${LOGS_DIR}/details/disk-usage.log" + free -h | tee -a "${LOGS_DIR}/details/memory-usage.log" + + { + echo "=== Environment Information ===" + uname -a + df -h + free -h + env + } | tee -a "${LOGS_DIR}/details/environment.log" + + echo "SRC_DIR=${GITHUB_WORKSPACE}" | tee -a "$GITHUB_ENV" + + - name: Setup cgroups + if: needs.check-skip.outputs.should_skip != 'true' + shell: bash + run: | + set -uxo pipefail + + if [ "${{ matrix.enable_cgroups }}" = "true" ]; then + + echo "Current mounts:" + mount | grep cgroup + + CGROUP_BASEDIR=/sys/fs/cgroup + + # 1. Basic setup with permissions + sudo chmod -R 777 ${CGROUP_BASEDIR}/ + sudo mkdir -p ${CGROUP_BASEDIR}/gpdb + sudo chmod -R 777 ${CGROUP_BASEDIR}/gpdb + sudo chown -R gpadmin:gpadmin ${CGROUP_BASEDIR}/gpdb + + # 2. Enable controllers + sudo bash -c "echo '+cpu +cpuset +memory +io' > ${CGROUP_BASEDIR}/cgroup.subtree_control" || true + sudo bash -c "echo '+cpu +cpuset +memory +io' > ${CGROUP_BASEDIR}/gpdb/cgroup.subtree_control" || true + + # 3. CPU settings + sudo bash -c "echo 'max 100000' > ${CGROUP_BASEDIR}/gpdb/cpu.max" || true + sudo bash -c "echo '100' > ${CGROUP_BASEDIR}/gpdb/cpu.weight" || true + sudo bash -c "echo '0' > ${CGROUP_BASEDIR}/gpdb/cpu.weight.nice" || true + sudo bash -c "echo 0-$(( $(nproc) - 1 )) > ${CGROUP_BASEDIR}/gpdb/cpuset.cpus" || true + sudo bash -c "echo '0' > ${CGROUP_BASEDIR}/gpdb/cpuset.mems" || true + + # 4. Memory settings + sudo bash -c "echo 'max' > ${CGROUP_BASEDIR}/gpdb/memory.max" || true + sudo bash -c "echo '0' > ${CGROUP_BASEDIR}/gpdb/memory.min" || true + sudo bash -c "echo 'max' > ${CGROUP_BASEDIR}/gpdb/memory.high" || true + + # 5. IO settings + echo "Available block devices:" + lsblk + + sudo bash -c " + if [ -f \${CGROUP_BASEDIR}/gpdb/io.stat ]; then + echo 'Detected IO devices:' + cat \${CGROUP_BASEDIR}/gpdb/io.stat + fi + echo '' > \${CGROUP_BASEDIR}/gpdb/io.max || true + " + + # 6. Fix permissions again after all writes + sudo chmod -R 777 ${CGROUP_BASEDIR}/gpdb + sudo chown -R gpadmin:gpadmin ${CGROUP_BASEDIR}/gpdb + + # 7. Check required files + echo "Checking required files:" + required_files=( + "cgroup.procs" + "cpu.max" + "cpu.pressure" + "cpu.weight" + "cpu.weight.nice" + "cpu.stat" + "cpuset.cpus" + "cpuset.mems" + "cpuset.cpus.effective" + "cpuset.mems.effective" + "memory.current" + "io.max" + ) + + for file in "${required_files[@]}"; do + if [ -f "${CGROUP_BASEDIR}/gpdb/$file" ]; then + echo "✓ $file exists" + ls -l "${CGROUP_BASEDIR}/gpdb/$file" + else + echo "✗ $file missing" + fi + done + + # 8. Test subdirectory creation + echo "Testing subdirectory creation..." + sudo -u gpadmin bash -c " + TEST_DIR=\${CGROUP_BASEDIR}/gpdb/test6448 + if mkdir -p \$TEST_DIR; then + echo 'Created test directory' + sudo chmod -R 777 \$TEST_DIR + if echo \$\$ > \$TEST_DIR/cgroup.procs; then + echo 'Successfully wrote to cgroup.procs' + cat \$TEST_DIR/cgroup.procs + # Move processes back to parent before cleanup + echo \$\$ > \${CGROUP_BASEDIR}/gpdb/cgroup.procs + else + echo 'Failed to write to cgroup.procs' + ls -la \$TEST_DIR/cgroup.procs + fi + ls -la \$TEST_DIR/ + rmdir \$TEST_DIR || { + echo 'Moving all processes to parent before cleanup' + cat \$TEST_DIR/cgroup.procs | while read pid; do + echo \$pid > \${CGROUP_BASEDIR}/gpdb/cgroup.procs 2>/dev/null || true + done + rmdir \$TEST_DIR + } + else + echo 'Failed to create test directory' + fi + " + + # 9. 
Verify setup as gpadmin user + echo "Testing cgroup access as gpadmin..." + sudo -u gpadmin bash -c " + echo 'Checking mounts...' + mount | grep cgroup + + echo 'Checking /proc/self/mounts...' + cat /proc/self/mounts | grep cgroup + + if ! grep -q cgroup2 /proc/self/mounts; then + echo 'ERROR: cgroup2 mount NOT visible to gpadmin' + exit 1 + fi + echo 'SUCCESS: cgroup2 mount visible to gpadmin' + + if ! [ -w ${CGROUP_BASEDIR}/gpdb ]; then + echo 'ERROR: gpadmin cannot write to gpdb cgroup' + exit 1 + fi + echo 'SUCCESS: gpadmin can write to gpdb cgroup' + + echo 'Verifying key files content:' + echo 'cpu.max:' + cat ${CGROUP_BASEDIR}/gpdb/cpu.max || echo 'Failed to read cpu.max' + echo 'cpuset.cpus:' + cat ${CGROUP_BASEDIR}/gpdb/cpuset.cpus || echo 'Failed to read cpuset.cpus' + echo 'cgroup.subtree_control:' + cat ${CGROUP_BASEDIR}/gpdb/cgroup.subtree_control || echo 'Failed to read cgroup.subtree_control' + " + + # 10. Show final state + echo "Final cgroup state:" + ls -la ${CGROUP_BASEDIR}/gpdb/ + echo "Cgroup setup completed successfully" + else + echo "Cgroup setup skipped" + fi + + - name: "Generate Test Job Summary Start: ${{ matrix.test }}" + if: always() + run: | + { + echo "# Test Job Summary: ${{ matrix.test }}" + echo "## Environment" + echo "- Start Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + + if [[ "${{ needs.check-skip.outputs.should_skip }}" == "true" ]]; then + echo "## Skip Status" + echo "✓ Test execution skipped via CI skip flag" + else + echo "- OS Version: $(cat /etc/redhat-release)" + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Download Cloudberry DEB build artifacts + if: needs.check-skip.outputs.should_skip != 'true' + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-deb-build-artifacts + path: ${{ github.workspace }}/deb_build_artifacts + merge-multiple: false + run-id: ${{ github.event.inputs.reuse_artifacts_from_run_id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download Cloudberry Source build artifacts + if: needs.check-skip.outputs.should_skip != 'true' + uses: actions/download-artifact@v4 + with: + name: apache-cloudberry-db-incubating-deb-source-build-artifacts + path: ${{ github.workspace }}/source_build_artifacts + merge-multiple: false + run-id: ${{ github.event.inputs.reuse_artifacts_from_run_id || github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Verify DEB artifacts + if: needs.check-skip.outputs.should_skip != 'true' + id: verify-artifacts + shell: bash + run: | + set -eo pipefail + + SRC_TARBALL_FILE=$(ls "${GITHUB_WORKSPACE}"/source_build_artifacts/apache-cloudberry-db-incubating_*.tar.xz) + if [ ! -f "${SRC_TARBALL_FILE}" ]; then + echo "::error::SRC TARBALL file not found" + exit 1 + fi + + echo "src_tarball_file=${SRC_TARBALL_FILE}" >> "$GITHUB_OUTPUT" + + echo "Verifying SRC TARBALL artifacts..." + { + echo "=== SRC TARBALL Verification Summary ===" + echo "Timestamp: $(date -u)" + echo "SRC TARBALL File: ${SRC_TARBALL_FILE}" + + # Calculate and store checksum + echo "Checksum:" + sha256sum "${SRC_TARBALL_FILE}" + + } 2>&1 | tee -a build-logs/details/src-tarball-verification.log + + DEB_FILE=$(ls "${GITHUB_WORKSPACE}"/deb_build_artifacts/*.deb) + if [ ! -f "${DEB_FILE}" ]; then + echo "::error::DEB file not found" + exit 1 + fi + + echo "deb_file=${DEB_FILE}" >> "$GITHUB_OUTPUT" + + echo "Verifying DEB artifacts..." 
+ { + echo "=== DEB Verification Summary ===" + echo "Timestamp: $(date -u)" + echo "DEB File: ${DEB_FILE}" + + # Get DEB metadata and verify contents + echo "Package Information:" + dpkg-deb -f "${DEB_FILE}" + + # Get key DEB attributes for verification + DEB_VERSION=$(dpkg-deb -f "${DEB_FILE}" Version | cut -d'-' -f 1) + DEB_RELEASE=$(dpkg-deb -f "${DEB_FILE}" Version | cut -d'-' -f 3) + echo "version=${DEB_VERSION}" >> "$GITHUB_OUTPUT" + echo "release=${DEB_RELEASE}" >> "$GITHUB_OUTPUT" + + # Verify expected binaries are in the DEB + echo "Verifying critical files in DEB..." + for binary in "bin/postgres" "bin/psql"; do + if ! dpkg-deb -c "${DEB_FILE}" | grep "${binary}" > /dev/null; then + echo "::error::Critical binary '${binary}' not found in DEB" + exit 1 + fi + done + + echo "DEB Details:" + echo "- Version: ${DEB_VERSION}" + echo "- Release: ${DEB_RELEASE}" + + # Calculate and store checksum + echo "Checksum:" + sha256sum "${DEB_FILE}" + + } 2>&1 | tee -a build-logs/details/deb-verification.log + + - name: Install Cloudberry DEB + if: success() && needs.check-skip.outputs.should_skip != 'true' + shell: bash + env: + DEB_FILE: ${{ steps.verify-artifacts.outputs.deb_file }} + DEB_VERSION: ${{ steps.verify-artifacts.outputs.version }} + DEB_RELEASE: ${{ steps.verify-artifacts.outputs.release }} + run: | + set -eo pipefail + + if [ -z "${DEB_FILE}" ]; then + echo "::error::DEB_FILE environment variable is not set" + exit 1 + fi + + { + echo "=== DEB Installation Log ===" + echo "Timestamp: $(date -u)" + echo "DEB File: ${DEB_FILE}" + echo "Version: ${DEB_VERSION}" + echo "Release: ${DEB_RELEASE}" + + # Clean install location + rm -rf /usr/local/cloudberry-db + + # Install DEB + echo "Starting installation..." + apt-get update + if ! apt-get -y install "${DEB_FILE}"; then + echo "::error::DEB installation failed" + exit 1 + fi + + # Change ownership back to gpadmin - it is needed for future tests + chown -R gpadmin:gpadmin /usr/local/cloudberry-db + + echo "Installation completed successfully" + dpkg-query -s apache-cloudberry-db-incubating + echo "Installed files:" + dpkg-query -L apache-cloudberry-db-incubating + } 2>&1 | tee -a build-logs/details/deb-installation.log + + - name: Extract source tarball + if: success() && needs.check-skip.outputs.should_skip != 'true' + shell: bash + env: + SRC_TARBALL_FILE: ${{ steps.verify-artifacts.outputs.src_tarball_file }} + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + { + echo "=== Source Extraction Log ===" + echo "Timestamp: $(date -u)" + + echo "Starting extraction..." + file "${SRC_TARBALL_FILE}" + if ! time tar xf "${SRC_TARBALL_FILE}" -C "${SRC_DIR}"/.. 
; then + echo "::error::Source extraction failed" + exit 1 + fi + + echo "Extraction completed successfully" + echo "Extracted contents:" + ls -la "${SRC_DIR}/../cloudberry" + echo "Directory size:" + du -sh "${SRC_DIR}/../cloudberry" + } 2>&1 | tee -a build-logs/details/source-extraction.log + + - name: Prepare DEB Environment + if: success() && needs.check-skip.outputs.should_skip != 'true' + shell: bash + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + { + + # change ownership to gpadmin + chown -R gpadmin "${SRC_DIR}/../cloudberry" + touch build-logs/sections.log + chown gpadmin build-logs/sections.log + chmod 777 build-logs + + # configure link lib directory to temporary location, fix it + rm -rf "${SRC_DIR}"/debian/build/lib + ln -sf /usr/cloudberry-db/lib "${SRC_DIR}"/debian/build/lib + + # check if regress.so exists in src directory - it is needed for contrib/dblink tests + if [ ! -f ${SRC_DIR}/src/test/regress/regress.so ]; then + ln -sf /usr/cloudberry-db/lib/postgresql/regress.so ${SRC_DIR}/src/test/regress/regress.so + fi + + # FIXME + # temporary install gdb - delete after creating new docker build/test contaners + apt-get update + apt-get -y install gdb + + } 2>&1 | tee -a build-logs/details/prepare-deb-env.log + + - name: Create Apache Cloudberry demo cluster + if: success() && needs.check-skip.outputs.should_skip != 'true' + shell: bash + env: + SRC_DIR: ${{ github.workspace }} + run: | + set -eo pipefail + + { + chmod +x "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh + if ! time su - gpadmin -c "cd ${SRC_DIR} && NUM_PRIMARY_MIRROR_PAIRS='${{ matrix.num_primary_mirror_pairs }}' SRC_DIR=${SRC_DIR} ${SRC_DIR}/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh"; then + echo "::error::Demo cluster creation failed" + exit 1 + fi + + } 2>&1 | tee -a build-logs/details/create-cloudberry-demo-cluster.log + + - name: "Run Tests: ${{ matrix.test }}" + if: success() && needs.check-skip.outputs.should_skip != 'true' + env: + SRC_DIR: ${{ github.workspace }} + shell: bash {0} + run: | + set -o pipefail + + # Initialize test status + overall_status=0 + + # Create logs directory structure + mkdir -p build-logs/details + + # Core file config + mkdir -p "/tmp/cloudberry-cores" + chmod 1777 "/tmp/cloudberry-cores" + sysctl -w kernel.core_pattern="/tmp/cloudberry-cores/core-%e-%s-%u-%g-%p-%t" + sysctl kernel.core_pattern + su - gpadmin -c "ulimit -c" + + # WARNING: PostgreSQL Settings + # When adding new pg_settings key/value pairs: + # 1. Add a new check below for the setting + # 2. Follow the same pattern as optimizer + # 3. 
Update matrix entries to include the new setting
+
+
+          # Set PostgreSQL options if defined
+          PG_OPTS=""
+          if [[ "${{ matrix.pg_settings.optimizer != '' }}" == "true" ]]; then
+            PG_OPTS="$PG_OPTS -c optimizer=${{ matrix.pg_settings.optimizer }}"
+          fi
+
+          if [[ "${{ matrix.pg_settings.default_table_access_method != '' }}" == "true" ]]; then
+            PG_OPTS="$PG_OPTS -c default_table_access_method=${{ matrix.pg_settings.default_table_access_method }}"
+          fi
+
+          # Read configs into array
+          IFS=' ' read -r -a configs <<< "${{ join(matrix.make_configs, ' ') }}"
+
+          echo "=== Starting test execution for ${{ matrix.test }} ==="
+          echo "Number of configurations to execute: ${#configs[@]}"
+          echo ""
+
+          # Execute each config separately
+          for ((i=0; i<${#configs[@]}; i++)); do
+            config="${configs[$i]}"
+            IFS=':' read -r dir target <<< "$config"
+
+            echo "=== Executing configuration $((i+1))/${#configs[@]} ==="
+            echo "Make command: make -C $dir $target"
+            echo "Environment:"
+            echo "- PGOPTIONS: ${PG_OPTS}"
+
+            # Create unique log file for this configuration
+            config_log="build-logs/details/make-${{ matrix.test }}-config$i.log"
+
+            # Clean up any existing core files
+            echo "Cleaning up existing core files..."
+            rm -f /tmp/cloudberry-cores/core-*
+
+            # Execute test script with proper environment setup
+            if ! time su - gpadmin -c "cd ${SRC_DIR} && \
+              MAKE_NAME='${{ matrix.test }}-config$i' \
+              MAKE_TARGET='$target' \
+              MAKE_DIRECTORY='-C $dir' \
+              PGOPTIONS='${PG_OPTS}' \
+              SRC_DIR='${SRC_DIR}' \
+              ${SRC_DIR}/devops/build/automation/cloudberry/scripts/test-cloudberry.sh" \
+              2>&1 | tee "$config_log"; then
+              echo "::warning::Test execution failed for configuration $((i+1)): make -C $dir $target"
+              overall_status=1
+            fi
+
+            # Check for results directory
+            results_dir="${dir}/results"
+
+            if [[ -d "$results_dir" ]]; then
+              echo "-----------------------------------------" | tee -a build-logs/details/make-${{ matrix.test }}-config$i-results.log
+              echo "Found results directory: $results_dir" | tee -a build-logs/details/make-${{ matrix.test }}-config$i-results.log
+              echo "Contents of results directory:" | tee -a build-logs/details/make-${{ matrix.test }}-config$i-results.log
+
+              find "$results_dir" -type f -ls 2>&1 | tee -a build-logs/details/make-${{ matrix.test }}-config$i-results.log
+              echo "-----------------------------------------" | tee -a build-logs/details/make-${{ matrix.test }}-config$i-results.log
+            else
+              echo "-----------------------------------------"
+              echo "Results directory $results_dir does not exist"
+              echo "-----------------------------------------"
+            fi
+
+            # Analyze any core files generated by this test configuration
+            echo "Analyzing core files for configuration ${{ matrix.test }}-config$i..."
+            test_id="${{ matrix.test }}-config$i"
+
+            # List the cores directory
+            echo "-----------------------------------------"
+            echo "Cores directory: /tmp/cloudberry-cores"
+            echo "Contents of cores directory:"
+            ls -Rl "/tmp/cloudberry-cores"
+            echo "-----------------------------------------"
+
+            "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/analyze_core_dumps.sh "$test_id"
+            core_analysis_rc=$?
+ case "$core_analysis_rc" in + 0) echo "No core dumps found for this configuration" ;; + 1) echo "Core dumps were found and analyzed successfully" ;; + 2) echo "::warning::Issues encountered during core dump analysis" ;; + *) echo "::error::Unexpected return code from core dump analysis: $core_analysis_rc" ;; + esac + + echo "Log file: $config_log" + echo "=== End configuration $((i+1)) execution ===" + echo "" + done + + echo "=== Test execution completed ===" + echo "Log files:" + ls -l build-logs/details/ + + # Store number of configurations for parsing step + echo "NUM_CONFIGS=${#configs[@]}" >> "$GITHUB_ENV" + + # Report overall status + if [ $overall_status -eq 0 ]; then + echo "All test executions completed successfully" + else + echo "::warning::Some test executions failed, check individual logs for details" + fi + + exit $overall_status + + - name: "Parse Test Results: ${{ matrix.test }}" + id: test-results + if: always() && needs.check-skip.outputs.should_skip != 'true' + env: + SRC_DIR: ${{ github.workspace }} + shell: bash {0} + run: | + set -o pipefail + + overall_status=0 + + # Get configs array to create context for results + IFS=' ' read -r -a configs <<< "${{ join(matrix.make_configs, ' ') }}" + + echo "=== Starting results parsing for ${{ matrix.test }} ===" + echo "Number of configurations to parse: ${#configs[@]}" + echo "" + + # Parse each configuration's results independently + for ((i=0; i "test_results.$i.txt" + overall_status=1 + continue + fi + + # Parse this configuration's results + + MAKE_NAME="${{ matrix.test }}-config$i" \ + "${SRC_DIR}"/devops/build/automation/cloudberry/scripts/parse-test-results.sh "$config_log" + status_code=$? + + { + echo "SUITE_NAME=${{ matrix.test }}" + echo "DIR=${dir}" + echo "TARGET=${target}" + } >> test_results.txt + + # Process return code + case $status_code in + 0) # All tests passed + echo "All tests passed successfully" + if [ -f test_results.txt ]; then + (echo "MAKE_COMMAND=\"make -C $dir $target\""; cat test_results.txt) | tee "test_results.${{ matrix.test }}.$i.txt" + rm test_results.txt + fi + ;; + 1) # Tests failed but parsed successfully + echo "Test failures detected but properly parsed" + if [ -f test_results.txt ]; then + (echo "MAKE_COMMAND=\"make -C $dir $target\""; cat test_results.txt) | tee "test_results.${{ matrix.test }}.$i.txt" + rm test_results.txt + fi + overall_status=1 + ;; + 2) # Parse error or missing file + echo "::warning::Could not parse test results properly for configuration $((i+1))" + { + echo "MAKE_COMMAND=\"make -C $dir $target\"" + echo "STATUS=parse_error" + echo "TOTAL_TESTS=0" + echo "FAILED_TESTS=0" + echo "PASSED_TESTS=0" + echo "IGNORED_TESTS=0" + } | tee "test_results.${{ matrix.test }}.$i.txt" + overall_status=1 + ;; + *) # Unexpected error + echo "::warning::Unexpected error during test results parsing for configuration $((i+1))" + { + echo "MAKE_COMMAND=\"make -C $dir $target\"" + echo "STATUS=unknown_error" + echo "TOTAL_TESTS=0" + echo "FAILED_TESTS=0" + echo "PASSED_TESTS=0" + echo "IGNORED_TESTS=0" + } | tee "test_results.${{ matrix.test }}.$i.txt" + overall_status=1 + ;; + esac + + echo "Results stored in test_results.$i.txt" + echo "=== End parsing for configuration $((i+1)) ===" + echo "" + done + + # Report status of results files + echo "=== Results file status ===" + echo "Generated results files:" + for ((i=0; i> "$GITHUB_STEP_SUMMARY" || true + + - name: Upload test logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-logs-${{ matrix.test }}-${{ 
needs.build-deb.outputs.build_timestamp }} + path: | + build-logs/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload Test Metadata + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-metadata-${{ matrix.test }} + path: | + test_results*.txt + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload test results files + uses: actions/upload-artifact@v4 + with: + name: results-${{ matrix.test }}-${{ needs.build-deb.outputs.build_timestamp }} + path: | + **/regression.out + **/regression.diffs + **/results/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + - name: Upload test regression logs + if: failure() || cancelled() + uses: actions/upload-artifact@v4 + with: + name: regression-logs-${{ matrix.test }}-${{ needs.build-deb.outputs.build_timestamp }} + path: | + **/regression.out + **/regression.diffs + **/results/ + gpAux/gpdemo/datadirs/standby/log/ + gpAux/gpdemo/datadirs/qddir/demoDataDir-1/log/ + gpAux/gpdemo/datadirs/dbfast1/demoDataDir0/log/ + gpAux/gpdemo/datadirs/dbfast2/demoDataDir1/log/ + gpAux/gpdemo/datadirs/dbfast3/demoDataDir2/log/ + gpAux/gpdemo/datadirs/dbfast_mirror1/demoDataDir0/log/ + gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/log/ + gpAux/gpdemo/datadirs/dbfast_mirror3/demoDataDir2/log/ + retention-days: ${{ env.LOG_RETENTION_DAYS }} + + ## ====================================================================== + ## Job: report-deb + ## ====================================================================== + + report-deb: + name: Generate Apache Cloudberry Build Report + needs: [check-skip, build-deb, prepare-test-matrix-deb, deb-install-test, test-deb] + if: always() + runs-on: ubuntu-22.04 + steps: + - name: Generate Final Report + run: | + { + echo "# Apache Cloudberry Build Pipeline Report" + + if [[ "${{ needs.check-skip.outputs.should_skip }}" == "true" ]]; then + echo "## CI Skip Status" + echo "✅ CI checks skipped via skip flag" + echo "- Completion Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + else + echo "## Job Status" + echo "- Build Job: ${{ needs.build-deb.result }}" + echo "- Test Job: ${{ needs.test-deb.result }}" + echo "- Completion Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + + if [[ "${{ needs.build-deb.result }}" == "success" && "${{ needs.test-deb.result }}" == "success" ]]; then + echo "✅ Pipeline completed successfully" + else + echo "⚠️ Pipeline completed with failures" + + if [[ "${{ needs.build-deb.result }}" != "success" ]]; then + echo "### Build Job Failure" + echo "Check build logs for details" + fi + + if [[ "${{ needs.test-deb.result }}" != "success" ]]; then + echo "### Test Job Failure" + echo "Check test logs and regression files for details" + fi + fi + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Notify on failure + if: | + needs.check-skip.outputs.should_skip != 'true' && + (needs.build-deb.result != 'success' || needs.test-deb.result != 'success') + run: | + echo "::error::Build/Test pipeline failed! 
Check job summaries and logs for details" + echo "Timestamp: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "Build Result: ${{ needs.build-deb.result }}" + echo "Test Result: ${{ needs.test-deb.result }}" diff --git a/.github/workflows/docker-cbdb-build-containers.yml b/.github/workflows/docker-cbdb-build-containers.yml new file mode 100644 index 00000000000..1b13e9ff3f4 --- /dev/null +++ b/.github/workflows/docker-cbdb-build-containers.yml @@ -0,0 +1,226 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# GitHub Actions Workflow for Apache Cloudberry Build Environments +# -------------------------------------------------------------------- +# Purpose: +# Builds, tests, and publishes multi-architecture Docker images for +# Apache Cloudberry DB build environments. Images are built for both +# Rocky Linux 8 and 9, tested with TestInfra, and pushed to DockerHub. +# +# Multi-Architecture Support: +# - Builds images for both AMD64 and ARM64 architectures +# - Creates and pushes multi-arch manifests +# - Uses QEMU for cross-platform builds +# - Automated testing for all architectures +# +# Image Tags: +# - Latest: cbdb-build-{platform}-latest +# - Versioned: cbdb-build-{platform}-{YYYYMMDD}-{git-short-sha} +# +# Features: +# - Matrix build for multiple platforms +# - Parallel architecture builds +# - Build caching strategy +# - Path filtering to only build changed platforms +# - Comprehensive build summary and metadata +# - Container testing with TestInfra +# - Multi-arch manifest creation +# +# Requirements: +# - DockerHub credentials in GitHub secrets +# - DOCKERHUB_USER +# - DOCKERHUB_TOKEN +# -------------------------------------------------------------------- + +name: docker-cbdb-build-containers + +# Trigger workflow on pushes to main when relevant paths change +# Also allows manual triggering via GitHub UI +on: + push: + branches: + - main + paths: + - 'devops/deploy/docker/build/rocky8/**' + - 'devops/deploy/docker/build/rocky9/**' + - 'devops/deploy/docker/build/ubuntu22.04/**' + pull_request: + paths: + - 'devops/deploy/docker/build/**' + workflow_dispatch: # Manual trigger + +# Prevent multiple workflow runs from interfering with each other +concurrency: + group: docker-build-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-push: + # Set timeout to prevent hanging builds + timeout-minutes: 60 + runs-on: ubuntu-latest + + # Matrix strategy to build for both Rocky Linux 8 and 9, Ubuntu 22.04 + strategy: + matrix: + platform: ['rocky8', 'rocky9', 'ubuntu22.04'] + + steps: + # Checkout repository code with full history + - name: Checkout code + uses: actions/checkout@v4 + + # Generate version 
information for image tags + # - BUILD_DATE: Current date in YYYYMMDD format + # - SHA_SHORT: Short form of the git commit SHA + - name: Set version + id: version + run: | + echo "BUILD_DATE=$(date -u +'%Y%m%d')" >> $GITHUB_OUTPUT + echo "SHA_SHORT=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + # Determine if the current platform's files have changed + # This prevents unnecessary builds if only one platform was modified + - name: Determine if platform changed + id: platform-filter + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + with: + filters: | + rocky8: + - 'devops/deploy/docker/build/rocky8/**' + rocky9: + - 'devops/deploy/docker/build/rocky9/**' + ubuntu22.04: + - 'devops/deploy/docker/build/ubuntu22.04/**' + + # Set up QEMU for multi-architecture support + # This allows building ARM64 images on AMD64 infrastructure and vice versa + - name: Set up QEMU + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' }} + uses: docker/setup-qemu-action@v3 + + # Login to DockerHub for pushing images + # Requires DOCKERHUB_USER and DOCKERHUB_TOKEN secrets to be set + - name: Login to Docker Hub + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main' }} + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # Setup Docker Buildx for efficient builds + # Enable debug mode for better troubleshooting + - name: Set up Docker Buildx + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' }} + uses: docker/setup-buildx-action@v3 + with: + buildkitd-flags: --debug + + # Build and test images for each architecture + # This ensures both AMD64 and ARM64 variants work correctly + - name: Build and test images + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' }} + run: | + # Build for each platform + for arch in amd64 arm64; do + # Build the image for testing + docker buildx build \ + --platform linux/$arch \ + --load \ + -t apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-$arch-test \ + ./devops/deploy/docker/build/${{ matrix.platform }} + + # Run tests in a container + docker run -d \ + -h cdw \ + --name cbdb-build-${{ matrix.platform }}-$arch-test \ + apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-$arch-test \ + bash -c "sleep 30" + + # Execute TestInfra tests + docker exec cbdb-build-${{ matrix.platform }}-$arch-test pytest \ + --cache-clear \ + --disable-warnings \ + -p no:warnings \ + /tests/testinfra/test_cloudberry_db_env.py + + # Cleanup test container + docker rm -f cbdb-build-${{ matrix.platform }}-$arch-test + done + + # Build and push multi-architecture images + # This creates a manifest list that supports both architectures + - name: Build and Push Multi-arch Docker images + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main' }} + uses: docker/build-push-action@v6 + with: + context: ./devops/deploy/docker/build/${{ matrix.platform }} + push: true + platforms: linux/amd64,linux/arm64 + # Tag with both latest and version-specific tags + tags: | + apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-latest + apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-${{ steps.version.outputs.BUILD_DATE }}-${{ steps.version.outputs.SHA_SHORT }} + # Add standard Open Container Initiative (OCI) labels + labels: | + org.opencontainers.image.source=${{ github.server_url 
}}/${{ github.repository }} + org.opencontainers.image.revision=${{ github.sha }} + org.opencontainers.image.created=${{ steps.version.outputs.BUILD_DATE }} + org.opencontainers.image.version=${{ steps.version.outputs.BUILD_DATE }}-${{ steps.version.outputs.SHA_SHORT }} + + # Generate a detailed build summary in GitHub Actions UI + # This provides quick access to build information and image usage instructions + - name: Build Summary + if: always() + run: | + # Add PR context notification + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + echo "#### ℹ️ Pull Request Build" >> $GITHUB_STEP_SUMMARY + echo "This is a validation build. Images are built and tested locally but **not pushed to Docker Hub** for security." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Dockerfile syntax validated" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Multi-architecture builds tested" >> $GITHUB_STEP_SUMMARY + echo "- ✅ TestInfra tests executed" >> $GITHUB_STEP_SUMMARY + echo "- ⏭️ Docker Hub push skipped (requires main branch)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + fi + + echo "### Build Summary for ${{ matrix.platform }} 🚀" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### 🔍 Build Information" >> $GITHUB_STEP_SUMMARY + echo "- **Build Status**: ${{ job.status }}" >> $GITHUB_STEP_SUMMARY + echo "- **Platform**: ${{ matrix.platform }}" >> $GITHUB_STEP_SUMMARY + echo "- **Architectures**: amd64, arm64" >> $GITHUB_STEP_SUMMARY + echo "- **Commit SHA**: [\`${{ github.sha }}\`](${{ github.server_url }}/${{ github.repository }}/commit/${{ github.sha }})" >> $GITHUB_STEP_SUMMARY + echo "- **Build Date**: ${{ steps.version.outputs.BUILD_DATE }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY + echo "- Latest tag: \`apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-latest\`" >> $GITHUB_STEP_SUMMARY + echo "- Version tag: \`apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-${{ steps.version.outputs.BUILD_DATE }}-${{ steps.version.outputs.SHA_SHORT }}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### 📋 Quick Reference" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY + echo "# Pull the image (automatically selects correct architecture)" >> $GITHUB_STEP_SUMMARY + echo "docker pull apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-latest" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "# Pull specific architecture if needed" >> $GITHUB_STEP_SUMMARY + echo "docker pull --platform linux/amd64 apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-latest" >> $GITHUB_STEP_SUMMARY + echo "docker pull --platform linux/arm64 apache/incubator-cloudberry:cbdb-build-${{ matrix.platform }}-latest" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/docker-cbdb-test-containers.yml b/.github/workflows/docker-cbdb-test-containers.yml new file mode 100644 index 00000000000..fcee6fa41b6 --- /dev/null +++ b/.github/workflows/docker-cbdb-test-containers.yml @@ -0,0 +1,199 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. 
The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Purpose: Builds, tests and pushes multi-architecture Docker images for +# Apache Cloudberry DB test environments. Images are built for both AMD64 +# and ARM64 architectures on Rocky Linux 8 and 9. +# +# Images are tagged with: +# - cbdb-test-rocky8-latest +# - cbdb-test-rocky8-{YYYYMMDD}-{git-short-sha} +# - cbdb-test-rocky9-latest +# - cbdb-test-rocky9-{YYYYMMDD}-{git-short-sha} +# +# Features: +# - Multi-architecture support (AMD64 and ARM64) +# - Matrix build for multiple platforms +# - QEMU emulation for cross-platform builds +# - Buildx for efficient multi-arch builds +# - Path filtering to only build changed platforms +# - Comprehensive build summary and metadata +# +# -------------------------------------------------------------------- + +name: docker-cbdb-test-containers + +# Trigger on pushes to docker-images branch when relevant paths change +# Also allows manual triggering via GitHub UI +on: + push: + branches: + - main + paths: + - 'devops/deploy/docker/test/rocky8/**' + - 'devops/deploy/docker/test/rocky9/**' + - 'devops/deploy/docker/test/ubuntu22.04/**' + pull_request: + paths: + - 'devops/deploy/docker/test/**' + workflow_dispatch: # Manual trigger + +# Prevent multiple workflow runs from interfering with each other +concurrency: + group: docker-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-push: + timeout-minutes: 60 # Prevent hanging builds + runs-on: ubuntu-latest + strategy: + matrix: + # Build for both Rocky Linux 8 and 9, Ubuntu 22.04 + platform: ['rocky8', 'rocky9', 'ubuntu22.04'] + + steps: + # Checkout repository code + - name: Checkout code + uses: actions/checkout@v4 + + # Generate version information for image tags + - name: Set version + id: version + run: | + echo "BUILD_DATE=$(date -u +'%Y%m%d')" >> $GITHUB_OUTPUT + echo "SHA_SHORT=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + # Determine if the current platform's files have changed + - name: Determine if platform changed + id: platform-filter + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + with: + filters: | + rocky8: + - 'devops/deploy/docker/test/rocky8/**' + rocky9: + - 'devops/deploy/docker/test/rocky9/**' + ubuntu22.04: + - 'devops/deploy/docker/test/ubuntu22.04/**' + + # Skip if no changes for current platform + - name: Skip if not relevant + if: ${{ steps.platform-filter.outputs[matrix.platform] != 'true' }} + run: echo "Skipping because the changes do not affect this platform" + + # Set up QEMU for multi-architecture support + # This allows building ARM64 images on AMD64 infrastructure and vice versa + - name: Set up QEMU + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' }} + uses: docker/setup-qemu-action@v3 + + # Login to DockerHub for pushing images + - name: Login to Docker Hub + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' && github.event_name == 'push' && 
github.ref == 'refs/heads/main' }} + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # Setup Docker Buildx for efficient multi-architecture builds + - name: Set up Docker Buildx + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' }} + uses: docker/setup-buildx-action@v3 + with: + buildkitd-flags: --debug + + # Build and test images for each architecture + # This ensures both AMD64 and ARM64 variants work correctly + - name: Build and test images + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' }} + run: | + # Build for each platform + for arch in amd64 arm64; do + echo "Building for $arch architecture..." + docker buildx build \ + --platform linux/$arch \ + --load \ + -t apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-$arch-test \ + ./devops/deploy/docker/test/${{ matrix.platform }} + done + + # Build and push multi-architecture images + # Creates a manifest list that supports both architectures + - name: Build and Push Multi-arch Docker images + if: ${{ steps.platform-filter.outputs[matrix.platform] == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main' }} + uses: docker/build-push-action@v6 + with: + context: ./devops/deploy/docker/test/${{ matrix.platform }} + push: true + platforms: linux/amd64,linux/arm64 + # Use caching for faster builds + cache-from: | + type=registry,ref=apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-latest + type=gha,scope=docker-cbdb-test-${{ matrix.platform }} + # Tag with both latest and version-specific tags + tags: | + apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-latest + apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-${{ steps.version.outputs.BUILD_DATE }}-${{ steps.version.outputs.SHA_SHORT }} + # Add metadata labels for better image tracking + labels: | + org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} + org.opencontainers.image.revision=${{ github.sha }} + org.opencontainers.image.created=${{ steps.version.outputs.BUILD_DATE }} + org.opencontainers.image.version=${{ steps.version.outputs.BUILD_DATE }}-${{ steps.version.outputs.SHA_SHORT }} + + # Generate a detailed build summary in GitHub Actions UI + # This provides quick access to build information and image usage instructions + - name: Build Summary + if: always() + run: | + # Add PR context notification + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + echo "#### ℹ️ Pull Request Build" >> $GITHUB_STEP_SUMMARY + echo "This is a validation build. Images are built and tested locally but **not pushed to Docker Hub** for security." 
>> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Dockerfile syntax validated" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Multi-architecture builds tested" >> $GITHUB_STEP_SUMMARY + echo "- ⏭️ Docker Hub push skipped (requires main branch)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + fi + + echo "### Build Summary for ${{ matrix.platform }} 🚀" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### 🔍 Build Information" >> $GITHUB_STEP_SUMMARY + echo "- **Build Status**: ${{ job.status }}" >> $GITHUB_STEP_SUMMARY + echo "- **Platform**: ${{ matrix.platform }}" >> $GITHUB_STEP_SUMMARY + echo "- **Architectures**: AMD64, ARM64" >> $GITHUB_STEP_SUMMARY + echo "- **Commit SHA**: [\`${{ github.sha }}\`](${{ github.server_url }}/${{ github.repository }}/commit/${{ github.sha }})" >> $GITHUB_STEP_SUMMARY + echo "- **Build Date**: ${{ steps.version.outputs.BUILD_DATE }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY + echo "- Latest tag: \`apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-latest\`" >> $GITHUB_STEP_SUMMARY + echo "- Version tag: \`apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-${{ steps.version.outputs.BUILD_DATE }}-${{ steps.version.outputs.SHA_SHORT }}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "#### 📋 Quick Reference" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY + echo "# Pull the image (automatically selects correct architecture)" >> $GITHUB_STEP_SUMMARY + echo "docker pull apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-latest" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "# Pull specific architecture if needed" >> $GITHUB_STEP_SUMMARY + echo "docker pull --platform linux/amd64 apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-latest" >> $GITHUB_STEP_SUMMARY + echo "docker pull --platform linux/arm64 apache/incubator-cloudberry:cbdb-test-${{ matrix.platform }}-latest" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/sonarqube.yml b/.github/workflows/sonarqube.yml index 6dcbb3eae4e..e67c2d96a54 100644 --- a/.github/workflows/sonarqube.yml +++ b/.github/workflows/sonarqube.yml @@ -32,6 +32,7 @@ # Triggers: # - Weekly schedule # - optional manual dispatch. 
+# - Changes on sonarqube.yml file # # Notes: # - SONARCLOUD_TOKEN secret is provided by the ASF Infra team @@ -42,6 +43,14 @@ on: schedule: - cron: "0 0 * * 1" workflow_dispatch: + push: + branches: + - main + paths: + - '.github/workflows/sonarqube.yml' + pull_request: + paths: + - '.github/workflows/sonarqube.yml' permissions: contents: read @@ -74,7 +83,7 @@ jobs: fi - name: Install Build Wrapper - uses: SonarSource/sonarqube-scan-action/install-build-wrapper@v5 + uses: SonarSource/sonarqube-scan-action/install-build-wrapper@v6 - name: Run Build Wrapper run: | @@ -112,9 +121,10 @@ jobs: build-wrapper-linux-x86-64 --out-dir ${{ env.BUILD_WRAPPER_OUT_DIR }} make -j$(nproc) - name: SonarQube Scan - uses: SonarSource/sonarqube-scan-action@v5 + if: ${{ github.event_name != 'pull_request' }} + uses: SonarSource/sonarqube-scan-action@v6 env: SONAR_TOKEN: ${{ secrets.SONARCLOUD_TOKEN }} with: args: > - --define sonar.cfamily.compile-commands="${{ env.BUILD_WRAPPER_OUT_DIR }}/compile_commands.json" + --define "sonar.cfamily.compile-commands=${{ env.BUILD_WRAPPER_OUT_DIR }}/compile_commands.json" diff --git a/GNUmakefile.in b/GNUmakefile.in index e6333e39bec..623074305ce 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -34,6 +34,9 @@ ifeq ($(with_openssl), yes) endif ifeq ($(enable_pax), yes) $(MAKE) -C contrib/pax_storage all +endif +ifeq ($(enable_ic_udp2),yes) + $(MAKE) -C contrib/udp2 all endif $(MAKE) -C gpMgmt all $(MAKE) -C gpcontrib all @@ -81,6 +84,9 @@ ifeq ($(enable_pax), yes) endif ifeq ($(with_openssl), yes) $(MAKE) -C contrib/sslinfo $@ +endif +ifeq ($(enable_ic_udp2),yes) + $(MAKE) -C contrib/udp2 $@ endif $(MAKE) -C gpMgmt $@ $(MAKE) -C gpcontrib $@ @@ -154,17 +160,17 @@ check-tests installcheck installcheck-parallel installcheck-tests: submake-gener $(MAKE) -C src/test/regress $@ check: - if [ ! -f $(prefix)/greenplum_path.sh ]; then \ + if [ ! -f $(prefix)/cloudberry-env.sh ]; then \ $(MAKE) -C $(top_builddir) install; \ fi - . $(prefix)/greenplum_path.sh; \ + . $(prefix)/cloudberry-env.sh; \ if pg_isready 1>/dev/null; then \ $(MAKE) -C $(top_builddir) installcheck; \ else \ if [ ! -f $(top_builddir)/gpAux/gpdemo/gpdemo-env.sh ]; then \ - . $(prefix)/greenplum_path.sh && $(MAKE) -C $(top_builddir) create-demo-cluster; \ + . $(prefix)/cloudberry-env.sh && $(MAKE) -C $(top_builddir) create-demo-cluster; \ fi; \ - . $(prefix)/greenplum_path.sh && . $(top_builddir)/gpAux/gpdemo/gpdemo-env.sh && $(MAKE) -C $(top_builddir) installcheck; \ + . $(prefix)/cloudberry-env.sh && . $(top_builddir)/gpAux/gpdemo/gpdemo-env.sh && $(MAKE) -C $(top_builddir) installcheck; \ fi $(call recurse,check-world,src/test src/pl src/interfaces/ecpg contrib src/bin gpcontrib,check) @@ -212,6 +218,11 @@ installcheck-gpcheckcat: $(call recurse,installcheck-world,gpcontrib/gp_replica_check,installcheck) $(call recurse,installcheck-world,src/bin/pg_upgrade,check) +.PHONY: installcheck-hot-standby +installcheck-hot-standby: submake-generated-headers + $(MAKE) -C src/test/regress installcheck-hot-standby + $(MAKE) -C src/test/isolation2 installcheck-hot-standby + # Run mock tests, that don't require a running server. Arguably these should # be part of [install]check-world, but we treat them more like part of # compilation than regression testing, in the CI. 
But they are too heavy-weight diff --git a/NOTICE b/NOTICE index 8003e980d0d..13e79d82ce5 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Apache Cloudberry (Incubating) -Copyright 2024-2025 The Apache Software Foundation +Copyright 2024-2026 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). \ No newline at end of file diff --git a/README.apache.md b/README.apache.md index 18772c9dbae..e99f2b2328f 100644 --- a/README.apache.md +++ b/README.apache.md @@ -38,3 +38,15 @@ The following entities have contributed to the Greenplum-based source code under - Broadcom Inc. RAT matchers are used to classify their license headers accordingly. + +## Compressed Files in Source + +The following compressed files are included in the source tree. These files are archives of text files used for testing purposes and do not contain binary executables. They are not used during the build process. + +- contrib/formatter_fixedwidth/data/fixedwidth_small_correct.tbl.gz +- gpMgmt/demo/gppkg/sample-sources.tar.gz +- src/bin/gpfdist/regress/data/exttab1/nation.tbl.gz +- src/bin/gpfdist/regress/data/gpfdist2/gz_multi_chunk.tbl.gz +- src/bin/gpfdist/regress/data/gpfdist2/gz_multi_chunk_2.tbl.gz +- src/bin/gpfdist/regress/data/gpfdist2/lineitem.tbl.bz2 +- src/bin/gpfdist/regress/data/gpfdist2/lineitem.tbl.gz diff --git a/README.md b/README.md index 25771f24a10..a72c42fd266 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ SonarQube Cloud [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/apache/cloudberry) +[![Apache Rat Audit](https://github.com/apache/cloudberry/actions/workflows/apache-rat-audit.yml/badge.svg)](https://github.com/apache/cloudberry/actions/workflows/apache-rat-audit.yml) --------- ## Introduction @@ -61,15 +62,14 @@ also be used for large-scale analytics and AI/ML workloads. ### Build from source -You can follow [these guides](./deploy/build) to build Cloudberry on +You can follow [these guides](https://cloudberry.apache.org/docs/deployment/) to build Cloudberry on Linux OS (including RHEL/Rocky Linux, and Ubuntu) and macOS. ### Try out quickly Welcome to try out Cloudberry via building [one Docker-based -Sandbox](https://github.com/apache/cloudberry-bootcamp), which is tailored to -help you gain a basic understanding of Cloudberry's capabilities and features -a range of materials, including tutorials, sample code, and crash courses. +Sandbox](./devops/sandbox), which is tailored to help you gain a basic +understanding of Cloudberry's capabilities and features. ## Repositories @@ -78,10 +78,8 @@ this, there are several ecosystem repositories for Cloudberry, including the website, extensions, connectors, adapters, and other utilities. * [apache/cloudberry-site](https://github.com/apache/cloudberry-site): website and documentation sources. -* [apache/cloudberry-bootcamp](https://github.com/apache/cloudberry-bootcamp): help you quickly try out Cloudberry via one Docker-based Sandbox. -* [apache/cloudberry-gpbackup](https://github.com/apache/cloudberry-gpbackup): backup utility for Cloudberry. +* [apache/cloudberry-backup](https://github.com/apache/cloudberry-backup): backup utility for Cloudberry. * [apache/cloudberry-go-libs](https://github.com/apache/cloudberry-go-libs): go-libs for Cloudberry. -* [apache/cloudberry-gpbackup-s3-plugin](https://github.com/apache/cloudberry-gpbackup-s3-plugin): S3 plugin for use with Cloudberry backup utility. 
* [apache/cloudberry-pxf](https://github.com/apache/cloudberry-pxf): Platform Extension Framework (PXF) for Cloudberry. ## Community & Support diff --git a/configure b/configure index 4297c803903..ab82db7d011 100755 --- a/configure +++ b/configure @@ -665,6 +665,7 @@ with_ssl have_yaml YAML_LIBS EVENT_LIBS +EVENT_CFLAGS apr_cppflags apr_cflags apr_link_ld_libs @@ -719,7 +720,6 @@ GREP with_apr_config with_libcurl with_rt -with_quicklz with_zstd with_libbz2 LZ4_LIBS @@ -761,9 +761,11 @@ PROTOBUF_LIBS PROTOBUF_CFLAGS enable_preload_ic_module enable_ic_proxy +enable_ic_udp2 enable_external_fts HAVE_CXX14 enable_gpcloud +enable_link_postgres_with_shared enable_shared_postgres_backend enable_mapreduce enable_serverless @@ -906,8 +908,10 @@ enable_catalog_ext enable_serverless enable_mapreduce enable_shared_postgres_backend +enable_link_postgres_with_shared enable_gpcloud enable_external_fts +enable_ic_udp2 enable_ic_proxy enable_preload_ic_module enable_pax @@ -938,7 +942,6 @@ with_zlib with_lz4 with_libbz2 with_zstd -with_quicklz with_rt with_libcurl with_apr_config @@ -980,6 +983,8 @@ LDFLAGS_EX LDFLAGS_SL PERL PYTHON +EVENT_CFLAGS +EVENT_LIBS MSGFMT TCLSH' @@ -1633,8 +1638,12 @@ Optional Features: --enable-mapreduce enable Cloudberry Mapreduce support --disable-shared-postgres-backend enable Cloudberry shared postgres backend support + --enable-link-postgres-with-shared + build Cloudberry using the shared library + libpostgres.so --enable-gpcloud enable gpcloud support --enable-external-fts enable external fts support + --enable-ic-udp2 enable interconnect udp2 implement --enable-ic-proxy enable interconnect proxy mode (requires libuv library) --disable-preload-ic-module @@ -1692,8 +1701,6 @@ Optional Packages: --with-lz4 build with LZ4 support --without-libbz2 do not use bzip2 --without-zstd do not build with Zstandard - --with-quicklz build with QuickLZ support (requires quicklz - library) --without-rt do not use Realtime Library --without-libcurl do not use libcurl --with-apr-config=PATH path to apr-1-config utility @@ -1736,6 +1743,9 @@ Some influential environment variables: LDFLAGS_SL extra linker flags for linking shared libraries only PERL Perl program PYTHON Python program + EVENT_CFLAGS + C compiler flags for EVENT, overriding pkg-config + EVENT_LIBS linker flags for EVENT, overriding pkg-config MSGFMT msgfmt program for NLS TCLSH Tcl interpreter program (tclsh) @@ -8592,6 +8602,45 @@ if test "$enable_shared_postgres_backend" = yes ; then : fi # fi +# +# --enable-link-postgres-with-shared enables linking postgres with shared library libpostgres.so +# + + +# Check whether --enable-link-postgres-with-shared was given. +if test "${enable_link_postgres_with_shared+set}" = set; then : + enableval=$enable_link_postgres_with_shared; + case $enableval in + yes) + +$as_echo "#define USE_LINK_POSTGRES_WITH_SHARED 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --enable-link-postgres-with-shared option" "$LINENO" 5 + ;; + esac + +else + enable_link_postgres_with_shared=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build Cloudberry using the shared library libpostgres.so... $enable_link_postgres_with_shared" >&5 +$as_echo "checking whether to build Cloudberry using the shared library libpostgres.so... 
$enable_link_postgres_with_shared" >&6; } + + +# check if enable-shared-postgres-backend is yes when enable-link-postgres-with-shared is yes +if test "$enable_link_postgres_with_shared" = yes; then + if test "$enable_shared_postgres_backend" = no; then + as_fn_error $? "--enable-link-postgres-with-shared is yes, but --enable-shared-postgres-backend is no" "$LINENO" 5 + fi +fi + # # gpcloud, enabled by default # @@ -9141,11 +9190,44 @@ $as_echo "checking whether to build with external-fts... $enable_external_fts" > if test "$enable_external_fts" = no; then - $as_echo "#define USE_INTERNAL_FTS 1" >>confdefs.h + +$as_echo "#define USE_INTERNAL_FTS 1" >>confdefs.h CFLAGS="$CFLAGS -DUSE_INTERNAL_FTS=1" fi +# +# ic-udp2 +# + + +# Check whether --enable-ic-udp2 was given. +if test "${enable_ic_udp2+set}" = set; then : + enableval=$enable_ic_udp2; + case $enableval in + yes) + +$as_echo "#define ENABLE_IC_UDP2 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --enable-ic-udp2 option" "$LINENO" 5 + ;; + esac + +else + enable_ic_udp2=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build with interconnect udp2 support ... $enable_ic_udp2" >&5 +$as_echo "checking whether to build with interconnect udp2 support ... $enable_ic_udp2" >&6; } + + # # ic-proxy # @@ -9393,6 +9475,55 @@ $as_echo "#define HAVE_ZSTD 1" >>confdefs.h fi + # Check liburing + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for io_uring_queue_init in -luring" >&5 +$as_echo_n "checking for io_uring_queue_init in -luring... " >&6; } +if ${ac_cv_lib_uring_io_uring_queue_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-luring $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char io_uring_queue_init (); +int +main () +{ +return io_uring_queue_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_uring_io_uring_queue_init=yes +else + ac_cv_lib_uring_io_uring_queue_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_uring_io_uring_queue_init" >&5 +$as_echo "$ac_cv_lib_uring_io_uring_queue_init" >&6; } +if test "x$ac_cv_lib_uring_io_uring_queue_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBURING 1 +_ACEOF + + LIBS="-luring $LIBS" + +else + as_fn_error $? "library 'uring' is required for PAX support" "$LINENO" 5 +fi + + # Check cmake >= 3.11.0 using AX_COMPARE_VERSION # Extract the first word of "cmake", so it can be a program name with args. set dummy cmake; ac_word=$2 @@ -11029,35 +11160,6 @@ $as_echo "yes" >&6; } fi fi -# -# quicklz -# - - - -# Check whether --with-quicklz was given. -if test "${with_quicklz+set}" = set; then : - withval=$with_quicklz; - case $withval in - yes) - : - ;; - no) - : - ;; - *) - as_fn_error $? "no argument expected for --with-quicklz option" "$LINENO" 5 - ;; - esac - -else - with_quicklz=no - -fi - - - - # # Realtime library # @@ -14678,56 +14780,6 @@ fi fi -if test "$with_quicklz" = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for qlz_compress in -lquicklz" >&5 -$as_echo_n "checking for qlz_compress in -lquicklz... 
" >&6; } -if ${ac_cv_lib_quicklz_qlz_compress+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lquicklz $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char qlz_compress (); -int -main () -{ -return qlz_compress (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_quicklz_qlz_compress=yes -else - ac_cv_lib_quicklz_qlz_compress=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_quicklz_qlz_compress" >&5 -$as_echo "$ac_cv_lib_quicklz_qlz_compress" >&6; } -if test "x$ac_cv_lib_quicklz_qlz_compress" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBQUICKLZ 1 -_ACEOF - - LIBS="-lquicklz $LIBS" - -else - as_fn_error $? "quicklz library not found." "$LINENO" 5 -fi - -fi - if test "$enable_external_fts" = yes; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking for jansson_version_str in -ljansson" >&5 $as_echo_n "checking for jansson_version_str in -ljansson... " >&6; } @@ -15059,65 +15111,78 @@ fi LIBS="$_LIBS" _LIBS="$LIBS" - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing event_add" >&5 -$as_echo_n "checking for library containing event_add... " >&6; } -if ${ac_cv_search_event_add+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_func_search_save_LIBS=$LIBS -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char event_add (); -int -main () -{ -return event_add (); - ; - return 0; -} -_ACEOF -for ac_lib in '' event; do - if test -z "$ac_lib"; then - ac_res="none required" - else - ac_res=-l$ac_lib - LIBS="-l$ac_lib $ac_func_search_save_LIBS" - fi - if ac_fn_c_try_link "$LINENO"; then : - ac_cv_search_event_add=$ac_res +pkg_failed=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for libevent >= 2.0.0" >&5 +$as_echo_n "checking for libevent >= 2.0.0... " >&6; } + +if test -n "$EVENT_CFLAGS"; then + pkg_cv_EVENT_CFLAGS="$EVENT_CFLAGS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libevent >= 2.0.0\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libevent >= 2.0.0") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_EVENT_CFLAGS=`$PKG_CONFIG --cflags "libevent >= 2.0.0" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext - if ${ac_cv_search_event_add+:} false; then : - break + else + pkg_failed=untried fi -done -if ${ac_cv_search_event_add+:} false; then : - +if test -n "$EVENT_LIBS"; then + pkg_cv_EVENT_LIBS="$EVENT_LIBS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libevent >= 2.0.0\""; } >&5 + ($PKG_CONFIG --exists --print-errors "libevent >= 2.0.0") 2>&5 + ac_status=$? 
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_EVENT_LIBS=`$PKG_CONFIG --libs "libevent >= 2.0.0" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes else - ac_cv_search_event_add=no + pkg_failed=yes fi -rm conftest.$ac_ext -LIBS=$ac_func_search_save_LIBS + else + pkg_failed=untried fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_event_add" >&5 -$as_echo "$ac_cv_search_event_add" >&6; } -ac_res=$ac_cv_search_event_add -if test "$ac_res" != no; then : - test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + + +if test $pkg_failed = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes else - as_fn_error $? "libevent is required for gpfdist" "$LINENO" 5 + _pkg_short_errors_supported=no fi + if test $_pkg_short_errors_supported = yes; then + EVENT_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libevent >= 2.0.0" 2>&1` + else + EVENT_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libevent >= 2.0.0" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$EVENT_PKG_ERRORS" >&5 - EVENT_LIBS=" -levent" + as_fn_error $? "libevent >= 2.0.0 is required for gpfdist" "$LINENO" 5 +elif test $pkg_failed = untried; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + as_fn_error $? "libevent >= 2.0.0 is required for gpfdist" "$LINENO" 5 +else + EVENT_CFLAGS=$pkg_cv_EVENT_CFLAGS + EVENT_LIBS=$pkg_cv_EVENT_LIBS + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +fi + EVENT_LIBS="$EVENT_LIBS" { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing yaml_parser_initialize" >&5 @@ -15761,7 +15826,8 @@ if test "$enable_openssl_redirect" = yes; then as_fn_error $? "--enable-openssl-redirect must specify --with-ssl" "$LINENO" 5 fi - $as_echo "#define OPENSSL_ALLOW_REDIRECT 1" >>confdefs.h + +$as_echo "#define OPENSSL_ALLOW_REDIRECT 1" >>confdefs.h fi @@ -16196,6 +16262,13 @@ fi +# for contrib/pax_storage +if test "$enable_pax" = yes; then + if test "$enable_shared_postgres_backend" = no; then + as_fn_error $? "PAX support requires --enable-shared-postgres-backend" "$LINENO" 5 + fi +fi + # for contrib/sepgsql if test "$with_selinux" = yes; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking for security_compute_create_name in -lselinux" >&5 @@ -16975,18 +17048,6 @@ else fi -fi - -# Check for quicklz.h -if test "$with_quicklz" = yes; then - ac_fn_c_check_header_mongrel "$LINENO" "quicklz.h" "ac_cv_header_quicklz_h" "$ac_includes_default" -if test "x$ac_cv_header_quicklz_h" = xyes; then : - -else - as_fn_error $? "header file is required for QuickLZ support" "$LINENO" 5 -fi - - fi if test "$enable_external_fts" = yes; then diff --git a/configure.ac b/configure.ac index 2732c728eb5..cc75aee241f 100644 --- a/configure.ac +++ b/configure.ac @@ -871,7 +871,7 @@ AC_SUBST(enable_mapreduce) # PGAC_ARG_BOOL(enable, shared-postgres-backend, yes, [enable Cloudberry shared postgres backend support], [AC_DEFINE([USE_SHARED_POSTGRES_BACKEND], 1, - [Define to 1 to build with shared postgres backend capabilities (--enable-shared-postgres-backend)])]) + [Define to 1 to build with shared Cloudberry backend capabilities (--enable-shared-postgres-backend)])]) AC_MSG_RESULT([checking whether to build with Cloudberry shared postgres backend... 
$enable_shared_postgres_backend]) AC_SUBST(enable_shared_postgres_backend) @@ -881,6 +881,22 @@ AS_IF([test "$enable_shared_postgres_backend" = yes ], CXXFLAGS="$CXXFLAGS -fPIC " ]) # fi +# +# --enable-link-postgres-with-shared enables linking postgres with shared library libpostgres.so +# +PGAC_ARG_BOOL(enable, link-postgres-with-shared, no, [build Cloudberry using the shared library libpostgres.so], + [AC_DEFINE([USE_LINK_POSTGRES_WITH_SHARED], 1, + [Define to 1 to build Cloudberry using the shared library libpostgres.so (--enable-link-postgres-with-shared)])]) +AC_MSG_RESULT([checking whether to build Cloudberry using the shared library libpostgres.so... $enable_link_postgres_with_shared]) +AC_SUBST(enable_link_postgres_with_shared) + +# check if enable-shared-postgres-backend is yes when enable-link-postgres-with-shared is yes +if test "$enable_link_postgres_with_shared" = yes; then + if test "$enable_shared_postgres_backend" = no; then + AC_MSG_ERROR([--enable-link-postgres-with-shared is yes, but --enable-shared-postgres-backend is no]) + fi +fi + # # gpcloud, enabled by default # @@ -903,10 +919,20 @@ AC_MSG_RESULT([checking whether to build with external-fts... $enable_external_f AC_SUBST(enable_external_fts) if test "$enable_external_fts" = no; then - AC_DEFINE([USE_INTERNAL_FTS], [1]) + AC_DEFINE([USE_INTERNAL_FTS], [1], [Define to 1 to use the internal FTS implementation.]) CFLAGS="$CFLAGS -DUSE_INTERNAL_FTS=1" fi +# +# ic-udp2 +# +PGAC_ARG_BOOL(enable, ic-udp2, no, + [enable interconnect udp2 implement], + [AC_DEFINE(ENABLE_IC_UDP2, 1, + [Define to 1 to build with interconnect udp2 support (--enable-ic-udp2)])]) +AC_MSG_RESULT([checking whether to build with interconnect udp2 support ... $enable_ic_udp2]) +AC_SUBST(enable_ic_udp2) + # # ic-proxy # @@ -946,6 +972,10 @@ if test "$enable_pax" = yes; then [AC_MSG_ERROR([libzstd >= 1.4.0 is required for PAX support])] ) + # Check liburing + AC_CHECK_LIB(uring, io_uring_queue_init, [], + [AC_MSG_ERROR([library 'uring' is required for PAX support])]) + # Check cmake >= 3.11.0 using AX_COMPARE_VERSION AC_PATH_PROG([CMAKE], [cmake], [no]) if test "$CMAKE" = "no"; then @@ -1340,13 +1370,6 @@ if test "$with_zstd" = yes; then PKG_CHECK_MODULES([ZSTD], [libzstd >= 1.4.0]) fi -# -# quicklz -# -PGAC_ARG_BOOL(with, quicklz, no, - [build with QuickLZ support (requires quicklz library)]) -AC_SUBST(with_quicklz) - # # Realtime library # @@ -1597,11 +1620,6 @@ failure. It is possible the compiler isn't looking in the proper directory. Use --without-zlib to disable zlib support.])]) fi -if test "$with_quicklz" = yes; then - AC_CHECK_LIB(quicklz, qlz_compress, [], - [AC_MSG_ERROR([quicklz library not found.])]) -fi - if test "$enable_external_fts" = yes; then AC_CHECK_LIB(jansson, jansson_version_str, [], [AC_MSG_ERROR([jansson library not found or version is too old, version must >= 2.13])]) @@ -1651,8 +1669,8 @@ AC_DEFUN([CHECK_APR], [ if test "$enable_gpfdist" = yes ; then CHECK_APR() _LIBS="$LIBS" - AC_SEARCH_LIBS(event_add, [event], [], [AC_MSG_ERROR([libevent is required for gpfdist])]) - EVENT_LIBS=" -levent" + PKG_CHECK_MODULES([EVENT], [libevent >= 2.0.0], [], [AC_MSG_ERROR([libevent >= 2.0.0 is required for gpfdist])]) + EVENT_LIBS="$EVENT_LIBS" AC_SUBST(EVENT_LIBS) AC_SEARCH_LIBS(yaml_parser_initialize, [yaml], [have_yaml=yes; YAML_LIBS=" -lyaml"], [AC_MSG_WARN([libyaml is not found. 
disabling transformations for gpfdist.])]) @@ -1738,7 +1756,7 @@ if test "$enable_openssl_redirect" = yes; then AC_MSG_ERROR([--enable-openssl-redirect must specify --with-ssl]) fi - AC_DEFINE([OPENSSL_ALLOW_REDIRECT], [1]) + AC_DEFINE([OPENSSL_ALLOW_REDIRECT], [1], [Define to 1 if you want to allow OpenSSL redirects.]) fi # Check for curl. @@ -1802,6 +1820,13 @@ fi AC_SUBST(LDAP_LIBS_FE) AC_SUBST(LDAP_LIBS_BE) +# for contrib/pax_storage +if test "$enable_pax" = yes; then + if test "$enable_shared_postgres_backend" = no; then + AC_MSG_ERROR([PAX support requires --enable-shared-postgres-backend]) + fi +fi + # for contrib/sepgsql if test "$with_selinux" = yes; then AC_CHECK_LIB(selinux, security_compute_create_name, [], @@ -1965,11 +1990,6 @@ if test "$with_libbz2" = yes ; then AC_CHECK_HEADER(bzlib.h, [], [AC_MSG_ERROR([header file is required for bzip2 support])], []) fi -# Check for quicklz.h -if test "$with_quicklz" = yes; then - AC_CHECK_HEADER(quicklz.h, [], [AC_MSG_ERROR([header file is required for QuickLZ support])]) -fi - if test "$enable_external_fts" = yes; then # Check for jansson AC_CHECK_HEADER(jansson.h, [], [AC_MSG_ERROR([header file is required for ETCD support])]) diff --git a/contrib/Makefile b/contrib/Makefile index 2292adb88f2..b14600e3557 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -104,6 +104,12 @@ else ALWAYS_SUBDIRS += pax_storage endif +ifeq ($(enable_ic_udp2),yes) +SUBDIRS += udp2 +else +ALWAYS_SUBDIRS += udp2 +endif + # Missing: # start-scripts \ (does not have a makefile) diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out index b6e98eaa68d..cf9ff0941d0 100644 --- a/contrib/amcheck/expected/check_heap.out +++ b/contrib/amcheck/expected/check_heap.out @@ -227,7 +227,7 @@ CREATE TABLE test_partitioned (a int, b text default repeat('x', 5000)) SELECT * FROM verify_heapam('test_partitioned', startblock := NULL, endblock := NULL); -ERROR: "test_partitioned" is not a table, materialized view, or TOAST table +ERROR: "test_partitioned" is not a table, directory table, materialized view, or TOAST table CONTEXT: SQL statement "SELECT t.blkno, t.offnum, t.attnum, t.msg FROM verify_heapam_internal(relation, on_error_stop, @@ -268,7 +268,7 @@ CREATE INDEX test_index ON test_partition (a); SELECT * FROM verify_heapam('test_index', startblock := NULL, endblock := NULL); -ERROR: "test_index" is not a table, materialized view, or TOAST table +ERROR: "test_index" is not a table, directory table, materialized view, or TOAST table CONTEXT: SQL statement "SELECT t.blkno, t.offnum, t.attnum, t.msg FROM verify_heapam_internal(relation, on_error_stop, @@ -283,7 +283,7 @@ CREATE VIEW test_view AS SELECT 1; SELECT * FROM verify_heapam('test_view', startblock := NULL, endblock := NULL); -ERROR: "test_view" is not a table, materialized view, or TOAST table +ERROR: "test_view" is not a table, directory table, materialized view, or TOAST table CONTEXT: SQL statement "SELECT t.blkno, t.offnum, t.attnum, t.msg FROM verify_heapam_internal(relation, on_error_stop, @@ -298,7 +298,7 @@ CREATE SEQUENCE test_sequence; SELECT * FROM verify_heapam('test_sequence', startblock := NULL, endblock := NULL); -ERROR: "test_sequence" is not a table, materialized view, or TOAST table +ERROR: "test_sequence" is not a table, directory table, materialized view, or TOAST table CONTEXT: SQL statement "SELECT t.blkno, t.offnum, t.attnum, t.msg FROM verify_heapam_internal(relation, on_error_stop, @@ -315,7 +315,7 @@ CREATE FOREIGN TABLE test_foreign_table () 
SERVER dummy_server; SELECT * FROM verify_heapam('test_foreign_table', startblock := NULL, endblock := NULL); -ERROR: "test_foreign_table" is not a table, materialized view, or TOAST table +ERROR: "test_foreign_table" is not a table, directory table, materialized view, or TOAST table CONTEXT: SQL statement "SELECT t.blkno, t.offnum, t.attnum, t.msg FROM verify_heapam_internal(relation, on_error_stop, diff --git a/contrib/amcheck/t/001_verify_heapam.pl b/contrib/amcheck/t/001_verify_heapam.pl index 64ba64d6b41..b9dbb42a67f 100644 --- a/contrib/amcheck/t/001_verify_heapam.pl +++ b/contrib/amcheck/t/001_verify_heapam.pl @@ -31,18 +31,18 @@ # fresh_test_table('test'); corrupt_first_page('test'); -detects_heap_corruption("verify_heapam('test')", "plain corrupted table"); +detects_heap_corruption("verify_heapam_internal('test')", "plain corrupted table"); detects_heap_corruption( - "verify_heapam('test', skip := 'all-visible')", + "verify_heapam_internal('test', skip := 'all-visible')", "plain corrupted table skipping all-visible"); detects_heap_corruption( - "verify_heapam('test', skip := 'all-frozen')", + "verify_heapam_internal('test', skip := 'all-frozen')", "plain corrupted table skipping all-frozen"); detects_heap_corruption( - "verify_heapam('test', check_toast := false)", + "verify_heapam_internal('test', check_toast := false)", "plain corrupted table skipping toast"); detects_heap_corruption( - "verify_heapam('test', startblock := 0, endblock := 0)", + "verify_heapam_internal('test', startblock := 0, endblock := 0)", "plain corrupted table checking only block zero"); # @@ -50,13 +50,13 @@ # fresh_test_table('test'); $node->safe_psql('postgres', q(VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) test)); -detects_no_corruption("verify_heapam('test')", +detects_no_corruption("verify_heapam_internal('test')", "all-frozen not corrupted table"); corrupt_first_page('test'); -detects_heap_corruption("verify_heapam('test')", +detects_heap_corruption("verify_heapam_internal('test')", "all-frozen corrupted table"); detects_no_corruption( - "verify_heapam('test', skip := 'all-frozen')", + "verify_heapam_internal('test', skip := 'all-frozen')", "all-frozen corrupted table skipping all-frozen"); # Returns the filesystem path for the named relation. @@ -208,7 +208,7 @@ sub check_all_options_uncorrupted . "endblock := $endblock"; detects_no_corruption( - "verify_heapam('$relname', $opts)", + "verify_heapam_internal('$relname', $opts)", "$prefix: $opts"); } } diff --git a/contrib/amcheck/t/003_cic_2pc.pl b/contrib/amcheck/t/003_cic_2pc.pl index 445aabaa5e9..eb4d9361955 100644 --- a/contrib/amcheck/t/003_cic_2pc.pl +++ b/contrib/amcheck/t/003_cic_2pc.pl @@ -9,7 +9,11 @@ use PostgresNode; use TestLib; -use Test::More tests => 5; +use Test::More tests => 0 + 1;#5; + +SKIP: +{ + skip "TWO PHASE transactions are not supported in Cloudberry, skip test", 1; Test::More->builder->todo_start('filesystem bug') if TestLib::has_wal_read_bug; @@ -186,3 +190,4 @@ $node->stop; done_testing(); +} diff --git a/contrib/btree_gin/expected/enum.out b/contrib/btree_gin/expected/enum.out index 2b0ddf11eb3..c4ac1174ea2 100644 --- a/contrib/btree_gin/expected/enum.out +++ b/contrib/btree_gin/expected/enum.out @@ -1,14 +1,11 @@ set enable_seqscan=off; CREATE TYPE rainbow AS ENUM ('r','o','y','g','b','i','v'); CREATE TABLE test_enum ( - h int, i rainbow ); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'h' as the Greenplum Database data distribution key for this table. 
-HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. -INSERT INTO test_enum VALUES (1, 'v'),(2, 'y'),(3, 'r'),(4, 'g'),(5, 'o'),(6, 'i'),(7, 'b'); +INSERT INTO test_enum VALUES ('v'),('y'),('r'),('g'),('o'),('i'),('b'); CREATE INDEX idx_enum ON test_enum USING gin (i); -SELECT i FROM test_enum WHERE i<'g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i<'g'::rainbow ORDER BY i; i --- r @@ -16,7 +13,7 @@ SELECT i FROM test_enum WHERE i<'g'::rainbow ORDER BY i; y (3 rows) -SELECT i FROM test_enum WHERE i<='g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i<='g'::rainbow ORDER BY i; i --- r @@ -25,13 +22,13 @@ SELECT i FROM test_enum WHERE i<='g'::rainbow ORDER BY i; g (4 rows) -SELECT i FROM test_enum WHERE i='g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i='g'::rainbow ORDER BY i; i --- g (1 row) -SELECT i FROM test_enum WHERE i>='g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i>='g'::rainbow ORDER BY i; i --- g @@ -40,7 +37,7 @@ SELECT i FROM test_enum WHERE i>='g'::rainbow ORDER BY i; v (4 rows) -SELECT i FROM test_enum WHERE i>'g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i>'g'::rainbow ORDER BY i; i --- b @@ -48,7 +45,7 @@ SELECT i FROM test_enum WHERE i>'g'::rainbow ORDER BY i; v (3 rows) -explain (costs off) SELECT i FROM test_enum WHERE i>='g'::rainbow ORDER BY i; +explain (costs off) SELECT * FROM test_enum WHERE i>='g'::rainbow ORDER BY i; QUERY PLAN ----------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) @@ -59,17 +56,11 @@ explain (costs off) SELECT i FROM test_enum WHERE i>='g'::rainbow ORDER BY i; Recheck Cond: (i >= 'g'::rainbow) -> Bitmap Index Scan on idx_enum Index Cond: (i >= 'g'::rainbow) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: Postgres query optimizer (9 rows) -- make sure we handle the non-evenly-numbered oid case for enums create type e as enum ('0', '2', '3'); alter type e add value '1' after '0'; -CREATE TABLE t ( - h int, - i e -); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'h' as the Greenplum Database data distribution key for this table. -HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
-insert into t select j, (j % 4)::text::e from generate_series(0, 100000) as j; -create index on t using gin (i); +create table t as select (i % 4)::text::e from generate_series(0, 100000) as i; +create index on t using gin (e); diff --git a/contrib/btree_gin/sql/enum.sql b/contrib/btree_gin/sql/enum.sql index 9876be88c85..f35162f8f58 100644 --- a/contrib/btree_gin/sql/enum.sql +++ b/contrib/btree_gin/sql/enum.sql @@ -1,30 +1,25 @@ set enable_seqscan=off; + CREATE TYPE rainbow AS ENUM ('r','o','y','g','b','i','v'); CREATE TABLE test_enum ( - h int, i rainbow ); -INSERT INTO test_enum VALUES (1, 'v'),(2, 'y'),(3, 'r'),(4, 'g'),(5, 'o'),(6, 'i'),(7, 'b'); +INSERT INTO test_enum VALUES ('v'),('y'),('r'),('g'),('o'),('i'),('b'); CREATE INDEX idx_enum ON test_enum USING gin (i); -SELECT i FROM test_enum WHERE i<'g'::rainbow ORDER BY i; -SELECT i FROM test_enum WHERE i<='g'::rainbow ORDER BY i; -SELECT i FROM test_enum WHERE i='g'::rainbow ORDER BY i; -SELECT i FROM test_enum WHERE i>='g'::rainbow ORDER BY i; -SELECT i FROM test_enum WHERE i>'g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i<'g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i<='g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i='g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i>='g'::rainbow ORDER BY i; +SELECT * FROM test_enum WHERE i>'g'::rainbow ORDER BY i; -explain (costs off) SELECT i FROM test_enum WHERE i>='g'::rainbow ORDER BY i; +explain (costs off) SELECT * FROM test_enum WHERE i>='g'::rainbow ORDER BY i; -- make sure we handle the non-evenly-numbered oid case for enums create type e as enum ('0', '2', '3'); alter type e add value '1' after '0'; - -CREATE TABLE t ( - h int, - i e -); -insert into t select j, (j % 4)::text::e from generate_series(0, 100000) as j; -create index on t using gin (i); +create table t as select (i % 4)::text::e from generate_series(0, 100000) as i; +create index on t using gin (e); diff --git a/contrib/btree_gist/expected/bit_optimizer.out b/contrib/btree_gist/expected/bit_optimizer.out index e4eff0ede1f..16ea5206989 100644 --- a/contrib/btree_gist/expected/bit_optimizer.out +++ b/contrib/btree_gist/expected/bit_optimizer.out @@ -74,6 +74,6 @@ SELECT a FROM bittmp WHERE a BETWEEN '1000000' and '1000001'; -> Index Scan using bitidx on bittmp Index Cond: ((a >= '1000000'::"bit") AND (a <= '1000001'::"bit")) Filter: ((a >= '1000000'::"bit") AND (a <= '1000001'::"bit")) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) diff --git a/contrib/btree_gist/expected/bytea_optimizer.out b/contrib/btree_gist/expected/bytea_optimizer.out index f5bdcf10d81..9577112dc0b 100644 --- a/contrib/btree_gist/expected/bytea_optimizer.out +++ b/contrib/btree_gist/expected/bytea_optimizer.out @@ -81,7 +81,7 @@ SELECT a FROM byteatmp where a > 'ffa'::bytea; -> Index Scan using byteaidx on byteatmp Index Cond: (a > '\x666661'::bytea) Filter: (a > '\x666661'::bytea) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) SELECT a FROM byteatmp where a > 'ffa'::bytea; diff --git a/contrib/btree_gist/expected/cash_optimizer.out b/contrib/btree_gist/expected/cash_optimizer.out index 3eaa66958ea..171dec7e511 100644 --- a/contrib/btree_gist/expected/cash_optimizer.out +++ b/contrib/btree_gist/expected/cash_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '21472.79' FROM moneytmp ORDER BY a <-> '21472.79' LIMIT 3; -> Result -> Gather Motion 3:1 (slice1; segments: 3) -> Seq Scan on moneytmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a 
<-> '21472.79' FROM moneytmp ORDER BY a <-> '21472.79' LIMIT 3; diff --git a/contrib/btree_gist/expected/char_optimizer.out b/contrib/btree_gist/expected/char_optimizer.out index 2b312df8b19..694a197be22 100644 --- a/contrib/btree_gist/expected/char_optimizer.out +++ b/contrib/btree_gist/expected/char_optimizer.out @@ -74,7 +74,7 @@ SELECT * FROM chartmp WHERE a BETWEEN '31a' AND '31c'; -> Index Scan using charidx on chartmp Index Cond: ((a >= '31a'::bpchar) AND (a <= '31c'::bpchar)) Filter: ((a >= '31a'::bpchar) AND (a <= '31c'::bpchar)) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) SELECT * FROM chartmp WHERE a BETWEEN '31a' AND '31c'; diff --git a/contrib/btree_gist/expected/date_optimizer.out b/contrib/btree_gist/expected/date_optimizer.out index c634639a4bb..a77041f847f 100644 --- a/contrib/btree_gist/expected/date_optimizer.out +++ b/contrib/btree_gist/expected/date_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '2001-02-13' FROM datetmp ORDER BY a <-> '2001-02-13' LIMIT 3; -> Sort Sort Key: ((a <-> '02-13-2001'::date)) -> Seq Scan on datetmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '2001-02-13' FROM datetmp ORDER BY a <-> '2001-02-13' LIMIT 3; diff --git a/contrib/btree_gist/expected/enum_optimizer.out b/contrib/btree_gist/expected/enum_optimizer.out index 4f57069c301..118676d163a 100644 --- a/contrib/btree_gist/expected/enum_optimizer.out +++ b/contrib/btree_gist/expected/enum_optimizer.out @@ -87,6 +87,6 @@ SELECT count(*) FROM enumtmp WHERE a >= 'g'::rainbow; -> Index Scan using enumidx on enumtmp Index Cond: (a >= 'g'::rainbow) Filter: (a >= 'g'::rainbow) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) diff --git a/contrib/btree_gist/expected/float4_optimizer.out b/contrib/btree_gist/expected/float4_optimizer.out index 94e24a790b4..cc40e9bd1ae 100644 --- a/contrib/btree_gist/expected/float4_optimizer.out +++ b/contrib/btree_gist/expected/float4_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '-179.0' FROM float4tmp ORDER BY a <-> '-179.0' LIMIT 3; -> Sort Sort Key: ((a <-> '-179'::real)) -> Seq Scan on float4tmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '-179.0' FROM float4tmp ORDER BY a <-> '-179.0' LIMIT 3; diff --git a/contrib/btree_gist/expected/float8_optimizer.out b/contrib/btree_gist/expected/float8_optimizer.out index 0a4e25a4af4..1bd96c44d3b 100644 --- a/contrib/btree_gist/expected/float8_optimizer.out +++ b/contrib/btree_gist/expected/float8_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '-1890.0' FROM float8tmp ORDER BY a <-> '-1890.0' LIMIT 3; -> Sort Sort Key: ((a <-> '-1890'::double precision)) -> Seq Scan on float8tmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '-1890.0' FROM float8tmp ORDER BY a <-> '-1890.0' LIMIT 3; diff --git a/contrib/btree_gist/expected/inet_optimizer.out b/contrib/btree_gist/expected/inet_optimizer.out index c694bae65e4..e6bff65a2bf 100644 --- a/contrib/btree_gist/expected/inet_optimizer.out +++ b/contrib/btree_gist/expected/inet_optimizer.out @@ -75,7 +75,7 @@ SELECT count(*) FROM inettmp WHERE a = '89.225.196.191'::inet; -> Index Scan using inetidx on inettmp Index Cond: (a = '89.225.196.191'::inet) Filter: (a = '89.225.196.191'::inet) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) SELECT count(*) FROM inettmp WHERE a = '89.225.196.191'::inet; @@ -96,7 +96,7 @@ SELECT count(*) FROM inettmp WHERE a = '89.225.196.191'::inet; -> Index Scan using 
inettmp_a_a1_idx on inettmp Index Cond: (a = '89.225.196.191'::inet) Filter: (a = '89.225.196.191'::inet) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) SELECT count(*) FROM inettmp WHERE a = '89.225.196.191'::inet; diff --git a/contrib/btree_gist/expected/int2_optimizer.out b/contrib/btree_gist/expected/int2_optimizer.out index b659ba51578..fdfc859097b 100644 --- a/contrib/btree_gist/expected/int2_optimizer.out +++ b/contrib/btree_gist/expected/int2_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '237' FROM int2tmp ORDER BY a <-> '237' LIMIT 3; -> Sort Sort Key: ((a <-> '237'::smallint)) -> Seq Scan on int2tmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '237' FROM int2tmp ORDER BY a <-> '237' LIMIT 3; diff --git a/contrib/btree_gist/expected/int4_optimizer.out b/contrib/btree_gist/expected/int4_optimizer.out index ab22e4b0c14..67107e63bfa 100644 --- a/contrib/btree_gist/expected/int4_optimizer.out +++ b/contrib/btree_gist/expected/int4_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '237' FROM int4tmp ORDER BY a <-> '237' LIMIT 3; -> Sort Sort Key: ((a <-> 237)) -> Seq Scan on int4tmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '237' FROM int4tmp ORDER BY a <-> '237' LIMIT 3; diff --git a/contrib/btree_gist/expected/int8_optimizer.out b/contrib/btree_gist/expected/int8_optimizer.out index 1dad0688e22..ba8e21135e8 100644 --- a/contrib/btree_gist/expected/int8_optimizer.out +++ b/contrib/btree_gist/expected/int8_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '464571291354841' FROM int8tmp ORDER BY a <-> '464571291354841' -> Sort Sort Key: ((a <-> '464571291354841'::bigint)) -> Seq Scan on int8tmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '464571291354841' FROM int8tmp ORDER BY a <-> '464571291354841' LIMIT 3; diff --git a/contrib/btree_gist/expected/interval_optimizer.out b/contrib/btree_gist/expected/interval_optimizer.out index 6f80bbb2dae..f5afd17456b 100644 --- a/contrib/btree_gist/expected/interval_optimizer.out +++ b/contrib/btree_gist/expected/interval_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '199 days 21:21:23' FROM intervaltmp ORDER BY a <-> '199 days 21 -> Sort Sort Key: ((a <-> '@ 199 days 21 hours 21 mins 23 secs'::interval)) -> Seq Scan on intervaltmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '199 days 21:21:23' FROM intervaltmp ORDER BY a <-> '199 days 21:21:23' LIMIT 3; @@ -104,7 +104,7 @@ SELECT a, a <-> '199 days 21:21:23' FROM intervaltmp ORDER BY a <-> '199 days 21 -> Sort Sort Key: ((a <-> '@ 199 days 21 hours 21 mins 23 secs'::interval)) -> Seq Scan on intervaltmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '199 days 21:21:23' FROM intervaltmp ORDER BY a <-> '199 days 21:21:23' LIMIT 3; diff --git a/contrib/btree_gist/expected/macaddr8_optimizer.out b/contrib/btree_gist/expected/macaddr8_optimizer.out index 5b2ae8caf31..25a84f81383 100644 --- a/contrib/btree_gist/expected/macaddr8_optimizer.out +++ b/contrib/btree_gist/expected/macaddr8_optimizer.out @@ -74,7 +74,7 @@ SELECT * FROM macaddr8tmp WHERE a < '02:03:04:05:06:07'::macaddr8; -> Index Scan using macaddr8idx on macaddr8tmp Index Cond: (a < '02:03:04:ff:fe:05:06:07'::macaddr8) Filter: (a < '02:03:04:ff:fe:05:06:07'::macaddr8) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) SELECT * FROM macaddr8tmp WHERE a < '02:03:04:05:06:07'::macaddr8; diff --git 
a/contrib/btree_gist/expected/macaddr_optimizer.out b/contrib/btree_gist/expected/macaddr_optimizer.out index b5dff8ac830..e5f4e0602f5 100644 --- a/contrib/btree_gist/expected/macaddr_optimizer.out +++ b/contrib/btree_gist/expected/macaddr_optimizer.out @@ -74,7 +74,7 @@ SELECT * FROM macaddrtmp WHERE a < '02:03:04:05:06:07'::macaddr; -> Index Scan using macaddridx on macaddrtmp Index Cond: (a < '02:03:04:05:06:07'::macaddr) Filter: (a < '02:03:04:05:06:07'::macaddr) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) SELECT * FROM macaddrtmp WHERE a < '02:03:04:05:06:07'::macaddr; diff --git a/contrib/btree_gist/expected/not_equal_optimizer.out b/contrib/btree_gist/expected/not_equal_optimizer.out index 579f92f0982..e99a1155ea0 100644 --- a/contrib/btree_gist/expected/not_equal_optimizer.out +++ b/contrib/btree_gist/expected/not_equal_optimizer.out @@ -16,7 +16,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_ne WHERE a <> '2009-01-01' AND b <> 10.7; Gather Motion 3:1 (slice1; segments: 3) -> Seq Scan on test_ne Filter: ((a <> 'Thu Jan 01 00:00:00 2009'::timestamp without time zone) AND (b <> 10.7)) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (4 rows) SELECT * FROM test_ne WHERE a <> '2009-01-01' AND b <> 10.7; diff --git a/contrib/btree_gist/expected/numeric_optimizer.out b/contrib/btree_gist/expected/numeric_optimizer.out index 3edf30fee0a..41c4996c06f 100644 --- a/contrib/btree_gist/expected/numeric_optimizer.out +++ b/contrib/btree_gist/expected/numeric_optimizer.out @@ -199,7 +199,7 @@ SELECT * FROM numerictmp WHERE a BETWEEN 1 AND 300 ORDER BY a; -> Index Scan using numericidx on numerictmp Index Cond: ((a >= '1'::numeric) AND (a <= '300'::numeric)) Filter: ((a >= '1'::numeric) AND (a <= '300'::numeric)) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (8 rows) SELECT * FROM numerictmp WHERE a BETWEEN 1 AND 300 ORDER BY a; diff --git a/contrib/btree_gist/expected/text_optimizer.out b/contrib/btree_gist/expected/text_optimizer.out index 9651b9d231d..76a2c86337b 100644 --- a/contrib/btree_gist/expected/text_optimizer.out +++ b/contrib/btree_gist/expected/text_optimizer.out @@ -81,7 +81,7 @@ SELECT * FROM texttmp WHERE a BETWEEN '31a' AND '31c'; -> Index Scan using textidx on texttmp Index Cond: ((a >= '31a'::text) AND (a <= '31c'::text)) Filter: ((a >= '31a'::text) AND (a <= '31c'::text)) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) SELECT * FROM texttmp WHERE a BETWEEN '31a' AND '31c'; diff --git a/contrib/btree_gist/expected/time_optimizer.out b/contrib/btree_gist/expected/time_optimizer.out index 2005e89bf9f..590ada880b9 100644 --- a/contrib/btree_gist/expected/time_optimizer.out +++ b/contrib/btree_gist/expected/time_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '10:57:11' FROM timetmp ORDER BY a <-> '10:57:11' LIMIT 3; -> Sort Sort Key: ((a <-> '10:57:11'::time without time zone)) -> Seq Scan on timetmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '10:57:11' FROM timetmp ORDER BY a <-> '10:57:11' LIMIT 3; diff --git a/contrib/btree_gist/expected/timestamp_optimizer.out b/contrib/btree_gist/expected/timestamp_optimizer.out index 296bc3e8ac8..1b8e709fe90 100644 --- a/contrib/btree_gist/expected/timestamp_optimizer.out +++ b/contrib/btree_gist/expected/timestamp_optimizer.out @@ -82,7 +82,7 @@ SELECT a, a <-> '2004-10-26 08:55:08' FROM timestamptmp ORDER BY a <-> '2004-10- -> Sort Sort Key: ((a <-> 'Tue Oct 26 08:55:08 2004'::timestamp without time zone)) -> Seq Scan on 
timestamptmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '2004-10-26 08:55:08' FROM timestamptmp ORDER BY a <-> '2004-10-26 08:55:08' LIMIT 3; diff --git a/contrib/btree_gist/expected/timestamptz_optimizer.out b/contrib/btree_gist/expected/timestamptz_optimizer.out index e2cd3dc89cb..2173c5dca35 100644 --- a/contrib/btree_gist/expected/timestamptz_optimizer.out +++ b/contrib/btree_gist/expected/timestamptz_optimizer.out @@ -202,7 +202,7 @@ SELECT a, a <-> '2018-12-18 10:59:54 GMT+2' FROM timestamptztmp ORDER BY a <-> ' -> Sort Sort Key: ((a <-> 'Tue Dec 18 04:59:54 2018 PST'::timestamp with time zone)) -> Seq Scan on timestamptztmp - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) SELECT a, a <-> '2018-12-18 10:59:54 GMT+2' FROM timestamptztmp ORDER BY a <-> '2018-12-18 10:59:54 GMT+2' LIMIT 3; diff --git a/contrib/btree_gist/expected/varbit_optimizer.out b/contrib/btree_gist/expected/varbit_optimizer.out index f73f5688ab1..98727abe44c 100644 --- a/contrib/btree_gist/expected/varbit_optimizer.out +++ b/contrib/btree_gist/expected/varbit_optimizer.out @@ -74,6 +74,6 @@ SELECT a FROM bittmp WHERE a BETWEEN '1000000' and '1000001'; -> Index Scan using bitidx on bittmp Index Cond: ((a >= '1000000'::"bit") AND (a <= '1000001'::"bit")) Filter: ((a >= '1000000'::"bit") AND (a <= '1000001'::"bit")) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (5 rows) diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index ea990020baf..af82922522b 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -198,7 +198,7 @@ dblink_get_conn(char *conname_or_str, connstr = get_connect_string(conname_or_str); if (connstr == NULL) connstr = conname_or_str; - dblink_connstr_check(connstr); + connstr = dblink_connstr_check(connstr); /* * We must obey fd.c's limit on non-virtual file descriptors. Assume @@ -312,7 +312,6 @@ dblink_connect(PG_FUNCTION_ARGS) /* check password in connection string if not superuser */ connstr = dblink_connstr_check(connstr); - dblink_connstr_check(connstr); /* * We must obey fd.c's limit on non-virtual file descriptors. 
Assume that diff --git a/contrib/interconnect/ic_common.c b/contrib/interconnect/ic_common.c index d629eb09698..7e266b69efb 100644 --- a/contrib/interconnect/ic_common.c +++ b/contrib/interconnect/ic_common.c @@ -541,14 +541,15 @@ GetMotionSentRecordTypmod(ChunkTransportState * transportStates, int16 motNodeID, int16 targetRoute) { - MotionConn *conn; + MotionConn *conn = NULL; ChunkTransportStateEntry *pEntry = NULL; getChunkTransportState(transportStates, motNodeID, &pEntry); - if (targetRoute == BROADCAST_SEGIDX) - conn = &pEntry->conns[0]; - else - conn = &pEntry->conns[targetRoute]; + if (targetRoute == BROADCAST_SEGIDX) { + targetRoute = 0; + } + + getMotionConn(pEntry, targetRoute, &conn); return &conn->sent_record_typmod; } diff --git a/contrib/interconnect/ic_internal.h b/contrib/interconnect/ic_internal.h index 77c08ee6e29..8557115cd33 100644 --- a/contrib/interconnect/ic_internal.h +++ b/contrib/interconnect/ic_internal.h @@ -10,6 +10,11 @@ */ #ifndef INTER_CONNECT_INTERNAL_H #define INTER_CONNECT_INTERNAL_H +#include +#include +#include +#include +#include #include "tcp/ic_tcp.h" #include "udp/ic_udpifc.h" @@ -33,6 +38,27 @@ typedef enum MotionConnState mcsEosSent } MotionConnState; +struct udp_send_vars +{ + /* send sequence variables */ + uint32_t snd_una; /* send unacknoledged */ + uint32_t snd_wnd; /* send window (unscaled) */ + + /* retransmission timeout variables */ + uint8_t nrtx; /* number of retransmission */ + uint8_t max_nrtx; /* max number of retransmission */ + uint32_t rto; /* retransmission timeout */ + uint32_t ts_rto; /* timestamp for retransmission timeout */ + + /* congestion control variables */ + uint32_t cwnd; /* congestion window */ + uint32_t ssthresh; /* slow start threshold */ + + TAILQ_ENTRY(MotionConnUDP) send_link; + TAILQ_ENTRY(MotionConnUDP) timer_link; /* timer link (rto list) */ + +}; + /* * Structure used for keeping track of a pt-to-pt connection between two * Cdb Entities (either QE or QD). 
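The hunk above introduces per-connection retransmission state (snd_una, rto, ts_rto, nrtx) and congestion-control fields (cwnd, ssthresh), and the hunks that follow add the srtt/mdev/rttvar bookkeeping that the estimateRTT comment later in this patch describes. To make that bookkeeping concrete, here is a minimal, self-contained sketch of a Jacobson/Karels-style fixed-point RTT-to-RTO update of the kind those fields conventionally drive. This is an illustration only, not code from this patch: the names (rtt_state, rtt_sample_update) and the clamp constants are assumptions chosen for the example.

```c
/*
 * Sketch: fixed-point RTT smoothing and RTO derivation.
 * srtt is kept scaled by 8, mdev scaled by 4, so the update uses only
 * shifts and adds -- no floating point in the hot path.
 */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define RTO_MIN_US   5000      /* assumed lower clamp, microseconds */
#define RTO_MAX_US   100000    /* assumed upper clamp, microseconds */

typedef struct rtt_state
{
    uint32_t srtt;   /* smoothed RTT, scaled by 8 */
    uint32_t mdev;   /* mean deviation, scaled by 4 */
    uint32_t rto;    /* retransmission timeout, microseconds */
} rtt_state;

/* Feed one measured RTT sample (microseconds) into the estimator. */
static void
rtt_sample_update(rtt_state *rs, uint32_t mrtt)
{
    if (rs->srtt == 0)
    {
        /* first sample: srtt = mrtt, mdev = mrtt / 2 (in scaled units) */
        rs->srtt = mrtt << 3;
        rs->mdev = mrtt << 1;
    }
    else
    {
        /* srtt += (mrtt - srtt) / 8 */
        int32_t delta = (int32_t) mrtt - (int32_t) (rs->srtt >> 3);
        rs->srtt = (uint32_t) ((int32_t) rs->srtt + delta);

        /* mdev += (|error| - mdev) / 4, all in scaled units */
        if (delta < 0)
            delta = -delta;
        delta -= (int32_t) (rs->mdev >> 2);
        rs->mdev = (uint32_t) ((int32_t) rs->mdev + delta);
    }

    /* rto = srtt + 4 * mdev, then clamp to sane bounds */
    rs->rto = (rs->srtt >> 3) + rs->mdev;
    if (rs->rto < RTO_MIN_US)
        rs->rto = RTO_MIN_US;
    if (rs->rto > RTO_MAX_US)
        rs->rto = RTO_MAX_US;
}

int
main(void)
{
    rtt_state rs = {0, 0, 0};
    uint32_t samples[] = {20000, 22000, 18000, 40000, 21000};

    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
    {
        rtt_sample_update(&rs, samples[i]);
        printf("sample=%" PRIu32 " us  srtt=%" PRIu32 " us  rto=%" PRIu32 " us\n",
               samples[i], rs.srtt >> 3, rs.rto);
    }
    return 0;
}
```

The point of the scaled integers is that a single RTT spike widens mdev (and hence the RTO) quickly, while the smoothed srtt decays it slowly, which is the behaviour the retransmission timer fields above appear intended to support.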
@@ -153,6 +179,32 @@ typedef struct MotionConnUDP uint64 stat_count_resent; uint64 stat_max_resent; uint64 stat_count_dropped; + + struct { + uint32_t ts_rto; + uint32_t rto; + uint32_t srtt; + uint32_t rttvar; + uint32_t snd_una; + uint16_t nrtx; + uint16_t max_nrtx; + uint32_t mss; + uint32_t cwnd; + uint32_t ssthresh; + uint32_t fss; + uint8_t loss_count; + uint32_t mdev; + uint32_t mdev_max; + uint32_t rtt_seq; /* sequence number to update rttvar */ + uint32_t ts_all_rto; + bool karn_mode; + } rttvar; + + uint8_t on_timewait_list; + int16_t on_rto_idx; + + uint32_t snd_nxt; /* send next */ + struct udp_send_vars sndvar; } MotionConnUDP; typedef struct MotionConnTCP diff --git a/contrib/interconnect/test/ic_test_env.c b/contrib/interconnect/test/ic_test_env.c index 5333a143de5..1c9f2d0ce05 100644 --- a/contrib/interconnect/test/ic_test_env.c +++ b/contrib/interconnect/test/ic_test_env.c @@ -330,6 +330,7 @@ client_side_global_var_init(MotionIPCLayer * motion_ipc_layer, pid_t *ic_proxy_p Gp_interconnect_queue_depth = 800; Gp_interconnect_snd_queue_depth = 600; + Gp_interconnect_mem_size = 20; Gp_interconnect_timer_period = 1; Gp_interconnect_timer_checking_period = 2; InitializeLatchSupport(); @@ -374,6 +375,7 @@ server_side_global_var_init(MotionIPCLayer * motion_ipc_layer, pid_t *ic_proxy_p Gp_interconnect_queue_depth = 800; Gp_interconnect_snd_queue_depth = 600; + Gp_interconnect_mem_size = 20; Gp_interconnect_timer_period = 1; Gp_interconnect_timer_checking_period = 2; InitializeLatchSupport(); diff --git a/contrib/interconnect/udp/ic_udpifc.c b/contrib/interconnect/udp/ic_udpifc.c index 63e8c9301dd..d11e4577cd6 100644 --- a/contrib/interconnect/udp/ic_udpifc.c +++ b/contrib/interconnect/udp/ic_udpifc.c @@ -26,13 +26,17 @@ #include "ic_udpifc.h" #include "ic_internal.h" #include "ic_common.h" - #include #include #include #include #include #include +#include +#include +#include +#include +#include #include "access/transam.h" #include "access/xact.h" @@ -116,6 +120,57 @@ WSAPoll( #undef select #endif +#define TIMEOUT_Z +#define RTT_SHIFT_ALPHA (3) /* srtt (0.125) */ +#define LOSS_THRESH (3) /* Packet loss triggers Karn */ +#define RTO_MIN (5000) /* MIN RTO(ms) */ +#define RTO_MAX (100000) /* MAX RTO(ms) */ +#define UDP_INFINITE_SSTHRESH 0x7fffffff + +#define SEC_TO_USEC(t) ((t) * 1000000) +#define SEC_TO_MSEC(t) ((t) * 1000) +#define MSEC_TO_USEC(t) ((t) * 1000) +#define USEC_TO_SEC(t) ((t) / 1000000) +#define TIME_TICK (1000000/HZ)/* in us */ + +#define UDP_INITIAL_RTO (MSEC_TO_USEC(200)) +#define UDP_DEFAULT_MSS 1460 + +#define RTO_HASH (3000) + +#define UDP_SEQ_LT(a,b) ((int32_t)((a)-(b)) < 0) +#define UDP_SEQ_LEQ(a,b) ((int32_t)((a)-(b)) <= 0) +#define UDP_SEQ_GT(a,b) ((int32_t)((a)-(b)) > 0) +#define UDP_SEQ_GEQ(a,b) ((int32_t)((a)-(b)) >= 0) + +#ifndef MAX +#define MAX(a, b) ((a)>(b)?(a):(b)) +#endif +#ifndef MIN +#define MIN(a, b) ((a)<(b)?(a):(b)) +#endif + +#define UDP_RTO_MIN ((unsigned)(HZ/5)) + +struct rto_hashstore +{ + uint32_t rto_now_idx; /* pointing the hs_table_s index */ + uint32_t rto_now_ts; + + TAILQ_HEAD(rto_head, MotionConnUDP) rto_list[RTO_HASH + 1]; +}; + +struct mudp_manager +{ + struct rto_hashstore *rto_store; /* lists related to timeout */ + + int rto_list_cnt; + uint32_t cur_ts; +}; + +typedef struct mudp_manager* mudp_manager_t; +static struct mudp_manager mudp; + #define MAX_TRY (11) int timeoutArray[] = @@ -154,6 +209,7 @@ int #define UDPIC_FLAGS_DISORDER (32) #define UDPIC_FLAGS_DUPLICATE (64) #define UDPIC_FLAGS_CAPACITY (128) +#define 
UDPIC_FLAGS_FULL (256) /* * ConnHtabBin @@ -516,8 +572,10 @@ static ICGlobalControlInfo ic_control_info; */ #define UNACK_QUEUE_RING_SLOTS_NUM (2000) #define TIMER_SPAN (Gp_interconnect_timer_period * 1000ULL) /* default: 5ms */ -#define TIMER_CHECKING_PERIOD (Gp_interconnect_timer_checking_period) /* default: 20ms */ +#define TIMER_SPAN_LOSS (Gp_interconnect_timer_period * 500ULL) /* default: 5ms */ +#define TIMER_CHECKING_PERIOD Gp_interconnect_timer_checking_period /* default: 20ms */ #define UNACK_QUEUE_RING_LENGTH (UNACK_QUEUE_RING_SLOTS_NUM * TIMER_SPAN) +#define UNACK_QUEUE_RING_LENGTH_LOSS (UNACK_QUEUE_RING_SLOTS_NUM * TIMER_SPAN_LOSS) #define DEFAULT_RTT (Gp_interconnect_default_rtt * 1000) /* default: 20ms */ #define MIN_RTT (100) /* 0.1ms */ @@ -537,6 +595,7 @@ static ICGlobalControlInfo ic_control_info; #define MAX_SEQS_IN_DISORDER_ACK (4) +#define MAX_QUEUE_SIZE (64) /* * UnackQueueRing * @@ -573,12 +632,19 @@ struct UnackQueueRing /* time slots */ ICBufferList slots[UNACK_QUEUE_RING_SLOTS_NUM]; +#ifdef TIMEOUT_Z + uint32_t retrans_count; + uint32_t no_retrans_count; + uint32_t time_difference; + uint32_t min; + uint32_t max; +#endif }; /* * All connections in a process share this unack queue ring instance. */ -static UnackQueueRing unack_queue_ring = {0, 0, 0}; +static UnackQueueRing unack_queue_ring = {0}; static int ICSenderSocket = -1; static int32 ICSenderPort = 0; @@ -746,8 +812,8 @@ static void checkQDConnectionAlive(void); static void *rxThreadFunc(void *arg); static bool handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len); -static void handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now); -static bool handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry); +static void handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now, struct icpkthdr *pkt); +static bool handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, bool need_flush); static void handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, int16 motionId); static void handleDisorderPacket(MotionConn *conn, int pos, uint32 tailSeq, icpkthdr *pkt); static bool handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, socklen_t *peerlen, AckSendParam *param, bool *wakeup_mainthread); @@ -766,9 +832,11 @@ static void initSndBufferPool(); static void putIntoUnackQueueRing(UnackQueueRing *uqr, ICBuffer *buf, uint64 expTime, uint64 now); static void initUnackQueueRing(UnackQueueRing *uqr); +static void initUdpManager(mudp_manager_t mptr); +static inline void checkNetworkTimeout(ICBuffer *buf, uint64 now, bool *networkTimeoutIsLogged); static void checkExpiration(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *triggerConn, uint64 now); -static void checkDeadlock(ChunkTransportStateEntry *pChunkEntry, MotionConn *conn); +static void checkDeadlock(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, MotionConn *conn); static bool cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len); static void cleanupStartupCache(void); @@ -924,6 +992,349 @@ dumpTransProtoStats() #endif /* TRANSFER_PROTOCOL_STATS */ +static struct rto_hashstore* +initRTOHashstore() +{ + int i; + struct rto_hashstore* hs = palloc(sizeof(struct rto_hashstore)); + + for (i = 0; i < RTO_HASH; i++) + TAILQ_INIT(&hs->rto_list[i]); + + TAILQ_INIT(&hs->rto_list[RTO_HASH]); + + return hs; +} + +static void 
+initUdpManager(mudp_manager_t mudp) +{ + mudp->rto_store = initRTOHashstore(); + mudp->rto_list_cnt = 0; + mudp->cur_ts = 0; +} + +static inline void +addtoRTOList(mudp_manager_t mudp, MotionConnUDP *cur_stream) +{ + if (!mudp->rto_list_cnt) + { + mudp->rto_store->rto_now_idx = 0; + mudp->rto_store->rto_now_ts = cur_stream->sndvar.ts_rto; + } + + if (cur_stream->on_rto_idx < 0 ) + { + if (cur_stream->on_timewait_list) + return; + + int diff = (int32_t)(cur_stream->sndvar.ts_rto - mudp->rto_store->rto_now_ts); + if (diff < RTO_HASH) + { + int offset= (diff + mudp->rto_store->rto_now_idx) % RTO_HASH; + cur_stream->on_rto_idx = offset; + TAILQ_INSERT_TAIL(&(mudp->rto_store->rto_list[offset]), + cur_stream, sndvar.timer_link); + } + else + { + cur_stream->on_rto_idx = RTO_HASH; + TAILQ_INSERT_TAIL(&(mudp->rto_store->rto_list[RTO_HASH]), + cur_stream, sndvar.timer_link); + } + mudp->rto_list_cnt++; + } +} + +static inline void +removeFromRTOList(mudp_manager_t mudp, + MotionConnUDP *cur_stream) +{ + if (cur_stream->on_rto_idx < 0) + return; + + TAILQ_REMOVE(&mudp->rto_store->rto_list[cur_stream->on_rto_idx], + cur_stream, sndvar.timer_link); + cur_stream->on_rto_idx = -1; + + mudp->rto_list_cnt--; +} + +static inline void +updateRetransmissionTimer(mudp_manager_t mudp, + MotionConnUDP *cur_stream, + uint32_t cur_ts) +{ + cur_stream->sndvar.nrtx = 0; + + /* if in rto list, remove it */ + if (cur_stream->on_rto_idx >= 0) + removeFromRTOList(mudp, cur_stream); + + /* Reset retransmission timeout */ + if (UDP_SEQ_GT(cur_stream->snd_nxt, cur_stream->sndvar.snd_una)) + { + /* there are packets sent but not acked */ + /* update rto timestamp */ + cur_stream->sndvar.ts_rto = cur_ts + cur_stream->sndvar.rto; + addtoRTOList(mudp, cur_stream); + } + + if (cur_stream->on_rto_idx == -1) + { + cur_stream->sndvar.ts_rto = cur_ts + cur_stream->sndvar.rto; + addtoRTOList(mudp, cur_stream); + } +} + +static int +handleRTO(mudp_manager_t mudp, + uint32_t cur_ts, + MotionConnUDP *cur_stream, + ChunkTransportState *transportStates, + ChunkTransportStateEntry *pEntry, + MotionConn *triggerConn) +{ + /* check for expiration */ + int count = 0; + int retransmits = 0; + MotionConnUDP *currBuffConn = NULL; + uint32_t now = cur_ts; + + Assert(unack_queue_ring.currentTime != 0); + removeFromRTOList(mudp, cur_stream); + + while (now >= (unack_queue_ring.currentTime + TIMER_SPAN) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) + { + /* expired, need to resend them */ + ICBuffer *curBuf = NULL; + + while ((curBuf = icBufferListPop(&unack_queue_ring.slots[unack_queue_ring.idx])) != NULL) + { + curBuf->nRetry++; + putIntoUnackQueueRing( + &unack_queue_ring, + curBuf, + computeExpirationPeriod(curBuf->conn, curBuf->nRetry), now); + +#ifdef TRANSFER_PROTOCOL_STATS + updateStats(TPE_DATA_PKT_SEND, curBuf->conn, curBuf->pkt); +#endif + + sendOnce(transportStates, pEntry, curBuf, curBuf->conn); + + currBuffConn = CONTAINER_OF(curBuf->conn, MotionConnUDP, mConn); + + retransmits++; + ic_statistics.retransmits++; + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, currBuffConn->stat_count_resent); + checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); + +#ifdef AMS_VERBOSE_LOGGING + write_log("RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", + curBuf->pkt->seq, curBuf->nRetry, curBuf->conn->rtt, curBuf->conn->route); + logPkt("RESEND PKT in checkExpiration", curBuf->pkt); +#endif + } + + unack_queue_ring.currentTime += TIMER_SPAN; + 
unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); + } + return 0; +} + +static inline void +rearrangeRTOStore(mudp_manager_t mudp) +{ + MotionConnUDP *walk, *next; + struct rto_head* rto_list = &mudp->rto_store->rto_list[RTO_HASH]; + int cnt = 0; + + for (walk = TAILQ_FIRST(rto_list); walk != NULL; walk = next) + { + next = TAILQ_NEXT(walk, sndvar.timer_link); + + int diff = (int32_t)(mudp->rto_store->rto_now_ts - walk->sndvar.ts_rto); + if (diff < RTO_HASH) + { + int offset = (diff + mudp->rto_store->rto_now_idx) % RTO_HASH; + TAILQ_REMOVE(&mudp->rto_store->rto_list[RTO_HASH], + walk, sndvar.timer_link); + walk->on_rto_idx = offset; + TAILQ_INSERT_TAIL(&(mudp->rto_store->rto_list[offset]), + walk, sndvar.timer_link); + } + cnt++; + } +} + +static inline void +checkRtmTimeout(mudp_manager_t mudp, + uint32_t cur_ts, + int thresh, + ChunkTransportState *transportStates, + ChunkTransportStateEntry *pEntry, + MotionConn *triggerConn) +{ + MotionConnUDP *walk, *next; + struct rto_head* rto_list; + int cnt; + + if (!mudp->rto_list_cnt) + return; + + cnt = 0; + + while (1) + { + rto_list = &mudp->rto_store->rto_list[mudp->rto_store->rto_now_idx]; + if ((int32_t)(cur_ts - mudp->rto_store->rto_now_ts) < 0) + break; + + for (walk = TAILQ_FIRST(rto_list); walk != NULL; walk = next) + { + if (++cnt > thresh) + break; + next = TAILQ_NEXT(walk, sndvar.timer_link); + + if (walk->on_rto_idx >= 0) + { + TAILQ_REMOVE(rto_list, walk, sndvar.timer_link); + mudp->rto_list_cnt--; + walk->on_rto_idx = -1; + handleRTO(mudp, cur_ts, walk, transportStates, pEntry, triggerConn); + } + } + + if (cnt > thresh) + { + break; + } + else + { + mudp->rto_store->rto_now_idx = (mudp->rto_store->rto_now_idx + 1) % RTO_HASH; + mudp->rto_store->rto_now_ts++; + if (!(mudp->rto_store->rto_now_idx % 1000)) + rearrangeRTOStore(mudp); + } + + } +} + +/* + * estimateRTT - Dynamically estimates the Round-Trip Time (RTT) and adjusts Retransmission Timeout (RTO) + * + * This function implements a variant of the Jacobson/Karels algorithm for RTT estimation, adapted for UDP-based + * motion control connections. It updates smoothed RTT (srtt), mean deviation (mdev), and RTO values based on + * newly measured RTT samples (mrtt). The RTO calculation ensures reliable data transmission over unreliable networks. + * + * Key Components: + * - srtt: Smoothed Round-Trip Time (weighted average of historical RTT samples) + * - mdev: Mean Deviation (measure of RTT variability) + * - rttvar: Adaptive RTT variation bound (used to clamp RTO updates) + * - rto: Retransmission Timeout (dynamically adjusted based on srtt + rttvar) + * + * Algorithm Details: + * 1. For the first RTT sample: + * srtt = mrtt << 3 (scaled by 8 for fixed-point arithmetic) + * mdev = mrtt << 1 (scaled by 2) + * rttvar = max(mdev, rto_min) + * 2. For subsequent samples: + * Delta = mrtt - (srtt >> 3) (difference between new sample and smoothed RTT) + * srtt += Delta (update srtt with 1/8 weight of new sample) + * Delta = abs(Delta) - (mdev >> 2) + * mdev += Delta (update mdev with 1/4 weight) + * 3. rttvar bounds the maximum RTT variation: + * If mdev > mdev_max, update mdev_max and rttvar + * On new ACKs (snd_una > rtt_seq), decay rttvar toward mdev_max + * 4. 
Final RTO calculation: + * rto = (srtt >> 3) + rttvar (clamped to RTO_MAX) + * + * Parameters: + * @mConn: Parent motion connection context (container of MotionConnUDP) + * @mrtt: Measured Round-Trip Time (in microseconds) for the latest packet + * + * Notes: + * - Designed for non-retransmitted packets to avoid sampling bias. + * - Uses fixed-point arithmetic to avoid floating-point operations. + * - Minimum RTO (rto_min) is set to 20ms (HZ/5/10, assuming HZ=100). + * - Critical for adaptive timeout control in UDP protocols where reliability is implemented at the application layer. + * - Thread-unsafe: Must be called in a synchronized context (e.g., packet processing loop). + */ + +static inline void +estimateRTT(MotionConn *mConn , uint32_t mrtt) +{ + /* This function should be called for not retransmitted packets */ + /* TODO: determine rto_min */ + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + long m = mrtt; + uint32_t rto_min = UDP_RTO_MIN / 10; + + if (m == 0) + m = 1; + + /* + * Special RTO optimization for high-speed networks: + * When measured RTT (m) is below 100 microseconds and current RTO is under 10ms, + * forcibly set RTO to half of RTO_MIN. This targets two scenarios: + * - Loopback interfaces (localhost communication) + * - Ultra-low-latency networks (e.g., InfiniBand, RDMA) + */ + if(m < 100 && conn->rttvar.rto < 10000) + { + conn->rttvar.rto = RTO_MIN / 2; + } + + if (conn->rttvar.srtt != 0) + { + /* rtt = 7/8 rtt + 1/8 new */ + m -= (conn->rttvar.srtt >> LOSS_THRESH); + conn->rttvar.srtt += m; + if (m < 0) + { + m = -m; + m -= (conn->rttvar.mdev >> RTT_SHIFT_ALPHA); + if (m > 0) + m >>= LOSS_THRESH; + } + else + { + m -= (conn->rttvar.mdev >> RTT_SHIFT_ALPHA); + } + conn->rttvar.mdev += m; + if (conn->rttvar.mdev > conn->rttvar.mdev_max) + { + conn->rttvar.mdev_max = conn->rttvar.mdev; + if (conn->rttvar.mdev_max > conn->rttvar.rttvar) + { + conn->rttvar.rttvar = conn->rttvar.mdev_max; + } + } + if (UDP_SEQ_GT(conn->rttvar.snd_una, conn->rttvar.rtt_seq)) + { + if (conn->rttvar.mdev_max < conn->rttvar.rttvar) + { + conn->rttvar.rttvar -= (conn->rttvar.rttvar - conn->rttvar.mdev_max) >> RTT_SHIFT_ALPHA; + } + conn->rttvar.mdev_max = rto_min; + } + } + else + { + /* fresh measurement */ + conn->rttvar.srtt = m << LOSS_THRESH; + conn->rttvar.mdev = m << 1; + conn->rttvar.mdev_max = conn->rttvar.rttvar = MAX(conn->rttvar.mdev, rto_min); + } + + conn->rttvar.rto = ((conn->rttvar.srtt >> LOSS_THRESH) + conn->rttvar.rttvar) > RTO_MAX ? RTO_MAX : ((conn->rttvar.srtt >> LOSS_THRESH) + conn->rttvar.rttvar); +} + + /* * initCursorICHistoryTable * Initialize cursor ic history table. @@ -2522,6 +2933,14 @@ initUnackQueueRing(UnackQueueRing *uqr) { icBufferListInit(&uqr->slots[i], ICBufferListType_Secondary); } + +#ifdef TIMEOUT_Z + uqr->retrans_count = 0; + uqr->no_retrans_count = 0; + uqr->time_difference = 0; + uqr->min = 0; + uqr->max = 0; +#endif } /* @@ -2556,6 +2975,9 @@ computeExpirationPeriod(MotionConn *mConn, uint32 retry) else #endif { + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) + return Min(retry > 3 ? conn->rttvar.rto * retry : conn->rttvar.rto, UNACK_QUEUE_RING_LENGTH_LOSS); + uint32 factor = (retry <= 12 ? 
retry : 12); return Max(MIN_EXPIRATION_PERIOD, Min(MAX_EXPIRATION_PERIOD, (conn->rtt + (conn->dev << 2)) << (factor))); @@ -2968,6 +3390,19 @@ setupOutgoingUDPConnection(ChunkTransportState *transportStates, ChunkTransportS conn->mConn.msgSize = sizeof(conn->conn_info); conn->mConn.stillActive = true; conn->conn_info.seq = 1; + conn->rttvar.ts_rto = 0; + conn->rttvar.rto = UDP_INITIAL_RTO; + conn->rttvar.srtt = 0; + conn->rttvar.rttvar = 0; + conn->rttvar.snd_una = 0; + conn->rttvar.nrtx = 0; + conn->rttvar.max_nrtx = 0; + conn->rttvar.mss = UDP_DEFAULT_MSS; + conn->rttvar.cwnd = 2; + conn->rttvar.ssthresh = UDP_INFINITE_SSTHRESH; + conn->rttvar.loss_count = 0; + conn->rttvar.karn_mode = false; + conn->on_rto_idx = -1; Assert(conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6); } /* setupOutgoingUDPConnection */ @@ -3207,6 +3642,19 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) conn->conn_info.icId = sliceTable->ic_instance_id; conn->conn_info.flags = UDPIC_FLAGS_RECEIVER_TO_SENDER; + conn->rttvar.ts_rto = 0; + conn->rttvar.rto = UDP_INITIAL_RTO; + conn->rttvar.srtt = 0; + conn->rttvar.rttvar = 0; + conn->rttvar.snd_una = 0; + conn->rttvar.nrtx = 0; + conn->rttvar.max_nrtx = 0; + conn->rttvar.mss = UDP_DEFAULT_MSS; + conn->rttvar.cwnd = 2; + conn->rttvar.ssthresh = UDP_INFINITE_SSTHRESH; + conn->rttvar.loss_count = 0; + conn->rttvar.karn_mode = false; + conn->on_rto_idx = -1; connAddHash(&ic_control_info.connHtab, &conn->mConn); } } @@ -3221,6 +3669,8 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) { initSndBufferPool(&snd_buffer_pool); initUnackQueueRing(&unack_queue_ring); + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER) + initUdpManager(&mudp); ic_control_info.isSender = true; ic_control_info.lastExpirationCheckTime = getCurrentTime(); ic_control_info.lastPacketSendTime = ic_control_info.lastExpirationCheckTime; @@ -3284,6 +3734,9 @@ static inline void SetupUDPIFCInterconnect(EState *estate) { ChunkTransportState *icContext = NULL; + int32 sliceNum = 0; + int32 calcQueueDepth = 0; + int32 calcSndDepth = 0; PG_TRY(); { /* @@ -3291,6 +3744,39 @@ SetupUDPIFCInterconnect(EState *estate) * technically it is not part of current query, discard it directly. 
*/ resetRxThreadError(); + if (estate != NULL && estate->es_sliceTable != NULL) + sliceNum = estate->es_sliceTable->numSlices; + else + sliceNum = 1; + + if (Gp_interconnect_mem_size > 0 && + Gp_interconnect_queue_depth == 4 && + Gp_interconnect_snd_queue_depth == 2) + { + int32 perQueue = Gp_interconnect_mem_size / + (Gp_max_packet_size * sliceNum); + + calcSndDepth = Max(Gp_interconnect_snd_queue_depth, perQueue / 2); + calcQueueDepth = Max(Gp_interconnect_queue_depth, perQueue - calcSndDepth); + + if (calcSndDepth > MAX_QUEUE_SIZE) + calcSndDepth = MAX_QUEUE_SIZE; + + if (calcQueueDepth > MAX_QUEUE_SIZE) + calcQueueDepth = MAX_QUEUE_SIZE; + + Gp_interconnect_snd_queue_depth = calcSndDepth; + Gp_interconnect_queue_depth = calcQueueDepth; + + elog(DEBUG1, "SetupUDPIFCInterconnect: queue depth, " + "queue_depth=%d, snd_queue_depth=%d, " + "mem_size=%d, slices=%d, packet_size=%d", + Gp_interconnect_queue_depth, + Gp_interconnect_snd_queue_depth, + Gp_interconnect_mem_size, + sliceNum, + Gp_max_packet_size); + } icContext = SetupUDPIFCInterconnect_Internal(estate->es_sliceTable); @@ -3815,7 +4301,6 @@ static TupleChunkListItem receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEntry *pEntry, int16 motNodeID, int16 *srcRoute, MotionConn *mConn) { - bool directed = false; int nFds = 0; int *waitFds = NULL; int nevent = 0; @@ -3832,7 +4317,6 @@ receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEn if (mConn != NULL) { conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - directed = true; *srcRoute = conn->route; setMainThreadWaiting(&rx_control_info.mainWaitingState, motNodeID, conn->route, pTransportStates->sliceTable->ic_instance_id); @@ -3965,7 +4449,10 @@ receiveChunksUDPIFCLoop(ChunkTransportState *pTransportStates, ChunkTransportSta /* check the potential errors in rx thread. */ checkRxThreadError(); - /* do not check interrupts when holding the lock */ + FaultInjector_InjectFaultIfSet("interconnect_stop_recv_chunk", + DDLNotSpecified, + "" /* databaseName */ , + "" /* tableName */ ); ML_CHECK_FOR_INTERRUPTS(pTransportStates->teardownActive); /* @@ -4472,7 +4959,7 @@ logPkt(char *prefix, icpkthdr *pkt) * packet is retransmitted. */ static void -handleAckedPacket(MotionConn *ackMotionConn, ICBuffer *buf, uint64 now) +handleAckedPacket(MotionConn *ackMotionConn, ICBuffer *buf, uint64 now, struct icpkthdr *pkt) { uint64 ackTime = 0; bool bufIsHead = false; @@ -4485,6 +4972,39 @@ handleAckedPacket(MotionConn *ackMotionConn, ICBuffer *buf, uint64 now) buf = icBufferListDelete(&ackConn->unackQueue, buf); + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE || Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER) + { + bufConn = CONTAINER_OF(buf->conn, MotionConnUDP, mConn); + buf = icBufferListDelete(&unack_queue_ring.slots[buf->unackQueueRingSlot], buf); + unack_queue_ring.numOutStanding--; + if (icBufferListLength(&ackConn->unackQueue) >= 1) + unack_queue_ring.numSharedOutStanding--; + + ackTime = now - buf->sentTime; + + if (buf->nRetry == 0) + { + /* adjust the congestion control window. 
*/ + if (snd_control_info.cwnd < snd_control_info.ssthresh) + snd_control_info.cwnd += 2; + else + snd_control_info.cwnd += 1 / snd_control_info.cwnd; + snd_control_info.cwnd = Min(snd_control_info.cwnd, snd_buffer_pool.maxCount); + } + + if ((bufConn->rttvar.rto << 1) > ackTime && pkt->retry_times != Gp_interconnect_min_retries_before_timeout) + estimateRTT(buf->conn, (now - pkt->send_time)); + + if (buf->nRetry && pkt->retry_times > 0 && pkt->retry_times < Gp_interconnect_min_retries_before_timeout) + bufConn->rttvar.rto += (bufConn->rttvar.rto >> 4 * buf->nRetry); + + if (unlikely(Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER)) + { + bufConn->sndvar.ts_rto = bufConn->rttvar.rto; + addtoRTOList(&mudp, bufConn); + } + } + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS) { buf = icBufferListDelete(&unack_queue_ring.slots[buf->unackQueueRingSlot], buf); @@ -4564,7 +5084,7 @@ handleAckedPacket(MotionConn *ackMotionConn, ICBuffer *buf, uint64 now) * if we receive a stop message, return true (caller will clean up). */ static bool -handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry) +handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, bool need_flush) { ChunkTransportStateEntryUDP * pEntry = NULL; bool ret = false; @@ -4577,7 +5097,6 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChun struct icpkthdr *pkt = snd_control_info.ackBuffer; - bool shouldSendBuffers = false; SliceTable *sliceTbl = transportStates->sliceTable; @@ -4702,6 +5221,12 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChun shouldSendBuffers |= (handleAckForDisorderPkt(transportStates, &pEntry->entry, &ackConn->mConn, pkt)); break; } + else if (pkt->flags & UDPIC_FLAGS_FULL) + { + if (DEBUG1 >= log_min_messages) + write_log("Recv buff is full [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); + break; + } /* * don't get out of the loop if pkt->seq equals to @@ -4751,7 +5276,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChun while (!icBufferListIsHead(&ackConn->unackQueue, link) && buf->pkt->seq <= pkt->seq) { next = link->next; - handleAckedPacket(&ackConn->mConn, buf, now); + handleAckedPacket(&ackConn->mConn, buf, now, pkt); shouldSendBuffers = true; link = next; buf = GET_ICBUFFER_FROM_PRIMARY(link); @@ -4767,7 +5292,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChun * still send here, since in STOP/EOS race case, we may have been * in EOS sending logic and will not check stop message. */ - if (shouldSendBuffers) + if (shouldSendBuffers && need_flush) sendBuffers(transportStates, &pEntry->entry, &ackConn->mConn); } else if (DEBUG1 >= log_min_messages) @@ -5011,7 +5536,7 @@ handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *p { if (pollAcks(transportStates, pEntry->txfd, 0)) { - if (handleAcks(transportStates, &pEntry->entry)) + if (handleAcks(transportStates, &pEntry->entry, true)) { /* more stops found, loop again. 
*/ i = 0; @@ -5053,7 +5578,7 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt { ICBuffer *buf = NULL; - if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS && + if ((Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS || Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) && (icBufferListLength(&conn->unackQueue) > 0 && unack_queue_ring.numSharedOutStanding >= (snd_control_info.cwnd - snd_control_info.minCwnd))) break; @@ -5074,7 +5599,7 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt icBufferListAppend(&conn->unackQueue, buf); - if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS) + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS || Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) { unack_queue_ring.numOutStanding++; if (icBufferListLength(&conn->unackQueue) > 1) @@ -5098,6 +5623,10 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt updateStats(TPE_DATA_PKT_SEND, conn, buf->pkt); #endif + struct icpkthdr *pkt_ = buf->pkt; + pkt_->send_time = now; + pkt_->recv_time = 0; + pkt_->retry_times = buf->nRetry; sendOnce(transportStates, pEntry, buf, &conn->mConn); ic_statistics.sndPktNum++; @@ -5245,7 +5774,7 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, if (buf->pkt->seq == pkt->seq) { - handleAckedPacket(&conn->mConn, buf, now); + handleAckedPacket(&conn->mConn, buf, now, pkt); shouldSendBuffers = true; break; } @@ -5255,7 +5784,7 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, /* this is a lost packet, retransmit */ buf->nRetry++; - if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS) + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS || Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) { buf = icBufferListDelete(&unack_queue_ring.slots[buf->unackQueueRingSlot], buf); putIntoUnackQueueRing(&unack_queue_ring, buf, @@ -5284,7 +5813,7 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, /* remove packet already received. 
*/ next = link->next; - handleAckedPacket(&conn->mConn, buf, now); + handleAckedPacket(&conn->mConn, buf, now, pkt); shouldSendBuffers = true; link = next; buf = GET_ICBUFFER_FROM_PRIMARY(link); @@ -5301,7 +5830,7 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, lostPktCnt--; } } - if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS) + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS || Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) { snd_control_info.ssthresh = Max(snd_control_info.cwnd / 2, snd_control_info.minCwnd); snd_control_info.cwnd = snd_control_info.ssthresh; @@ -5354,7 +5883,7 @@ handleAckForDuplicatePkt(MotionConn *mConn, icpkthdr *pkt) while (!icBufferListIsHead(&conn->unackQueue, link) && (buf->pkt->seq <= pkt->extraSeq)) { next = link->next; - handleAckedPacket(&conn->mConn, buf, now); + handleAckedPacket(&conn->mConn, buf, now, pkt); shouldSendBuffers = true; link = next; buf = GET_ICBUFFER_FROM_PRIMARY(link); @@ -5366,7 +5895,7 @@ handleAckForDuplicatePkt(MotionConn *mConn, icpkthdr *pkt) next = link->next; if (buf->pkt->seq == pkt->seq) { - handleAckedPacket(&conn->mConn, buf, now); + handleAckedPacket(&conn->mConn, buf, now, pkt); shouldSendBuffers = true; break; } @@ -5448,55 +5977,230 @@ checkExpiration(ChunkTransportState *transportStates, uint64 now) { /* check for expiration */ - int count = 0; - int retransmits = 0; + int count = 0; + int retransmits = 0; MotionConnUDP *currBuffConn = NULL; Assert(unack_queue_ring.currentTime != 0); - while (now >= (unack_queue_ring.currentTime + TIMER_SPAN) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) + + if (unlikely(Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER)) { - /* expired, need to resend them */ - ICBuffer *curBuf = NULL; + checkRtmTimeout(&mudp, now, 500, transportStates, pEntry, triggerConn); + return; + } - while ((curBuf = icBufferListPop(&unack_queue_ring.slots[unack_queue_ring.idx])) != NULL) + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) + { + uint64 timer_span_time = unack_queue_ring.currentTime + TIMER_SPAN_LOSS; + + while (now >= (timer_span_time + unack_queue_ring.time_difference) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) { - curBuf->nRetry++; - putIntoUnackQueueRing( - &unack_queue_ring, - curBuf, - computeExpirationPeriod(curBuf->conn, curBuf->nRetry), now); + /* expired, need to resend them */ + ICBuffer *curBuf = NULL; + + while ((curBuf = icBufferListPop(&unack_queue_ring.slots[unack_queue_ring.idx])) != NULL) + { + MotionConnUDP *conn = NULL; + conn = CONTAINER_OF(curBuf->conn, MotionConnUDP, mConn); + curBuf->nRetry++; + + /* + * Fixed Timeout Thresholds: Traditional TCP-style Retransmission Timeout + * (RTTVAR.RTO) calculations may be too rigid for networks with volatile + * latency. This leads to: + * Premature Retransmissions: Unnecessary data resends during temporary + * latency spikes, wasting bandwidth. + * Delayed Recovery: Slow reaction to actual packet loss when RTO is + * overly conservative. + * + * Lack of Context Awareness: Static RTO ignores real-time network behavior + * patterns, reducing throughput and responsiveness. + * + * Solution: Dynamic Timeout Threshold Adjustment + * Implements an adaptive timeout mechanism to optimize retransmission: + * if (now < (curBuf->sentTime + conn->rttvar.rto)) { + * uint32_t diff = (curBuf->sentTime + conn->rttvar.rto) - now; + * // ... 
(statistical tracking and threshold adjustment) + * } + * Temporary Latency Spike: Uses max (conservative) to avoid false + * retransmits, reducing bandwidth waste (vs. traditional mistaken + * retransmissions). + * Persistent Packet Loss: Prioritizes min (aggressive) via + * weight_retrans, accelerating recovery (vs. slow fixed-RTO reaction). + * Stable Network: Balances weights for equilibrium throughput (vs. + * static RTO limitations). + */ + if (now < (curBuf->sentTime + conn->rttvar.rto)) + { +#ifdef TIMEOUT_Z + uint32_t diff = (curBuf->sentTime + conn->rttvar.rto) - now; + if(unack_queue_ring.retrans_count == 0 && unack_queue_ring.no_retrans_count == 0) + { + unack_queue_ring.min = diff; + unack_queue_ring.max = diff; + } + + if (diff < unack_queue_ring.min) unack_queue_ring.min = diff; + if (diff > unack_queue_ring.max) unack_queue_ring.max = diff; + + if (unack_queue_ring.retrans_count == 0) + unack_queue_ring.time_difference = unack_queue_ring.max; + else if (unack_queue_ring.no_retrans_count == 0 && ic_statistics.retransmits < (Gp_interconnect_min_retries_before_timeout / 4)) + unack_queue_ring.time_difference = 0; + else + { + uint32_t total_count = unack_queue_ring.retrans_count + unack_queue_ring.no_retrans_count; + double weight_retrans = (double)unack_queue_ring.retrans_count / total_count; + double weight_no_retrans = (double)unack_queue_ring.no_retrans_count / total_count; + unack_queue_ring.time_difference = (uint32_t)(unack_queue_ring.max * weight_no_retrans + unack_queue_ring.min * weight_retrans); + } + + ++unack_queue_ring.no_retrans_count; + } + else + ++unack_queue_ring.retrans_count; +#endif #ifdef TRANSFER_PROTOCOL_STATS - updateStats(TPE_DATA_PKT_SEND, curBuf->conn, curBuf->pkt); + updateStats(TPE_DATA_PKT_SEND, curBuf->conn, curBuf->pkt); #endif + ChunkTransportStateEntryUDP *pEntryUdp; + pEntryUdp = CONTAINER_OF(pEntry, ChunkTransportStateEntryUDP, entry); + putIntoUnackQueueRing(&unack_queue_ring, + curBuf, + computeExpirationPeriod(curBuf->conn, curBuf->nRetry), getCurrentTime()); + struct icpkthdr *pkt_ = curBuf->pkt; - sendOnce(transportStates, pEntry, curBuf, curBuf->conn); + pkt_->send_time = getCurrentTime(); + pkt_->recv_time = 0; + pkt_->retry_times = curBuf->nRetry; - currBuffConn = CONTAINER_OF(curBuf->conn, MotionConnUDP, mConn); + sendOnce(transportStates, pEntry, curBuf, curBuf->conn); - retransmits++; - ic_statistics.retransmits++; - currBuffConn->stat_count_resent++; - currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, - currBuffConn->stat_count_resent); + /* + * Adaptive Retry Backoff with Polling for Network Asymmetry Mitigation + * + * This logic addresses two critical network pathologies: + * 1. RTO Distortion Amplification: + * - Packet loss in volatile networks causes RTO-based retransmission errors + * - Multiple spurious retries increase network load and congestion collapse risk + * 2. Data Skew-Induced Starvation: + * - Under unbalanced workloads, low-traffic nodes experience MON (Message Order Number) delays + * - Delayed ACKs trigger false retransmissions even when packets arrive eventually + * - Unacked queue inflation worsens congestion in high-traffic nodes + */ + int32_t loop_ack = curBuf->nRetry; + uint32_t rto_min = UDP_RTO_MIN / 10; + uint32_t rtoMs = conn->rttvar.rto / 1000; + int32_t wait_time = rto_min > rtoMs ? 
rto_min : rtoMs; + int32_t loop = 0; - checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); + /* + * To optimize performance, we need to process all the time-out file descriptors (fds) + * in each batch together. + */ + if (loop_ack > 0) + { + while (loop++ < loop_ack) + { + if (pollAcks(transportStates, pEntryUdp->txfd, wait_time)) + { + handleAcks(transportStates, pEntry, false); + curBuf->nRetry = 0; + break; + } + + struct icpkthdr *pkt_ = curBuf->pkt; + pkt_->send_time = getCurrentTime(); + pkt_->recv_time = 0; + pkt_->retry_times = curBuf->nRetry; + + sendOnce(transportStates, pEntry, curBuf, curBuf->conn); + + if (loop_ack < (Gp_interconnect_min_retries_before_timeout / 10)) + wait_time += wait_time / 10; + else if (loop_ack > (Gp_interconnect_min_retries_before_timeout / 10) && loop_ack < (Gp_interconnect_min_retries_before_timeout / 5)) + wait_time += RTO_MAX / 10; + else if (loop_ack > (Gp_interconnect_min_retries_before_timeout / 5) && loop_ack < (Gp_interconnect_min_retries_before_timeout / 2)) + wait_time += RTO_MAX / 5; + else if (loop_ack < (Gp_interconnect_min_retries_before_timeout)) + wait_time += RTO_MAX; + }; + } + + if (loop_ack > Gp_interconnect_min_retries_before_timeout / 5) + write_log("Resending packet (seq %d) to %s (pid %d cid %d) with %d retries in %lu seconds", + curBuf->pkt->seq, curBuf->conn->remoteHostAndPort, + curBuf->pkt->dstPid, curBuf->pkt->dstContentId, curBuf->nRetry, + (now - curBuf->sentTime) / 1000 / 1000); + + currBuffConn = CONTAINER_OF(curBuf->conn, MotionConnUDP, mConn); + + retransmits++; + ic_statistics.retransmits++; + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, + currBuffConn->stat_count_resent); + + checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); #ifdef AMS_VERBOSE_LOGGING - write_log("RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", - curBuf->pkt->seq, curBuf->nRetry, currBuffConn->rtt, currBuffConn->route); - logPkt("RESEND PKT in checkExpiration", curBuf->pkt); + write_log("RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", + curBuf->pkt->seq, curBuf->nRetry, currBuffConn->rtt, currBuffConn->route); + logPkt("RESEND PKT in checkExpiration", curBuf->pkt); #endif + } + + timer_span_time += TIMER_SPAN_LOSS; + unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); } + } + else + { + while (now >= (unack_queue_ring.currentTime + TIMER_SPAN) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) + { + /* expired, need to resend them */ + ICBuffer *curBuf = NULL; - unack_queue_ring.currentTime += TIMER_SPAN; - unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); + while ((curBuf = icBufferListPop(&unack_queue_ring.slots[unack_queue_ring.idx])) != NULL) + { + curBuf->nRetry++; + putIntoUnackQueueRing( + &unack_queue_ring, + curBuf, + computeExpirationPeriod(curBuf->conn, curBuf->nRetry), now); + +#ifdef TRANSFER_PROTOCOL_STATS + updateStats(TPE_DATA_PKT_SEND, curBuf->conn, curBuf->pkt); +#endif + + sendOnce(transportStates, pEntry, curBuf, curBuf->conn); + + currBuffConn = CONTAINER_OF(curBuf->conn, MotionConnUDP, mConn); + + retransmits++; + ic_statistics.retransmits++; + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, currBuffConn->stat_count_resent); + checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); + +#ifdef AMS_VERBOSE_LOGGING + write_log("RESEND pkt with seq %d 
(retry %d, rtt " UINT64_FORMAT ") to route %d", + curBuf->pkt->seq, curBuf->nRetry, curBuf->conn->rtt, curBuf->conn->route); + logPkt("RESEND PKT in checkExpiration", curBuf->pkt); +#endif + } + + unack_queue_ring.currentTime += TIMER_SPAN; + unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); + } + + /* + * deal with case when there is a long time this function is not called. + */ + unack_queue_ring.currentTime = now - (now % (TIMER_SPAN)); } - /* - * deal with case when there is a long time this function is not called. - */ - unack_queue_ring.currentTime = now - (now % TIMER_SPAN); if (retransmits > 0) { snd_control_info.ssthresh = Max(snd_control_info.cwnd / 2, snd_control_info.minCwnd); @@ -5524,7 +6228,7 @@ checkExpiration(ChunkTransportState *transportStates, * */ static void -checkDeadlock(ChunkTransportStateEntry *pChunkEntry, MotionConn *mConn) +checkDeadlock(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, MotionConn *mConn) { uint64 deadlockCheckTime; ChunkTransportStateEntryUDP *pEntry = NULL; @@ -5561,17 +6265,31 @@ checkDeadlock(ChunkTransportStateEntry *pChunkEntry, MotionConn *mConn) ic_control_info.lastDeadlockCheckTime = now; ic_statistics.statusQueryMsgNum++; + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE && pollAcks(transportStates, pEntry->txfd, 50)) + { + handleAcks(transportStates, pChunkEntry, false); + conn->deadlockCheckBeginTime = now; + } + /* check network error. */ - if ((now - conn->deadlockCheckBeginTime) > ((uint64) Gp_interconnect_transmit_timeout * 1000 * 1000)) + if ((now - conn->deadlockCheckBeginTime) > ((uint64) Gp_interconnect_transmit_timeout * 100 * 1000)) { - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect encountered a network error, please check your network"), - errdetail("Did not get any response from %s (pid %d cid %d) in %d seconds.", - conn->mConn.remoteHostAndPort, - conn->conn_info.dstPid, - conn->conn_info.dstContentId, - Gp_interconnect_transmit_timeout))); + write_log("Did not get any response from %s (pid %d cid %d) in 600 seconds.",conn->mConn.remoteHostAndPort, + conn->conn_info.dstPid, + conn->conn_info.dstContentId); + + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER) + conn->capacity += 1; + + if ((now - conn->deadlockCheckBeginTime) > ((uint64) Gp_interconnect_transmit_timeout * 1000 * 1000)) + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect encountered a network error, please check your network"), + errdetail("Did not get any response from %s (pid %d cid %d) in %d seconds.", + conn->mConn.remoteHostAndPort, + conn->conn_info.dstPid, + conn->conn_info.dstContentId, + Gp_interconnect_transmit_timeout))); } } } @@ -5690,7 +6408,7 @@ checkExceptions(ChunkTransportState *transportStates, checkExpirationCapacityFC(transportStates, pEntry, conn, timeout); } - if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS) + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS || Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) { uint64 now = getCurrentTime(); @@ -5703,7 +6421,7 @@ checkExceptions(ChunkTransportState *transportStates, if ((retry & 0x3) == 2) { - checkDeadlock(pEntry, conn); + checkDeadlock(transportStates, pEntry, conn); checkRxThreadError(); ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); } @@ -5735,14 +6453,24 @@ static inline int computeTimeout(MotionConn *mConn, int retry) { MotionConnUDP *conn = NULL; + 
uint32_t rtoMs = 0; conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + rtoMs = conn->rttvar.rto / 1000; if (icBufferListLength(&conn->unackQueue) == 0) return TIMER_CHECKING_PERIOD; ICBufferLink *bufLink = icBufferListFirst(&conn->unackQueue); ICBuffer *buf = GET_ICBUFFER_FROM_PRIMARY(bufLink); + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) + { + if (buf->nRetry == 0 && retry == 0 && unack_queue_ring.numSharedOutStanding < (snd_control_info.cwnd - snd_control_info.minCwnd)) + return 0; + + return rtoMs > TIMER_CHECKING_PERIOD ? rtoMs: TIMER_CHECKING_PERIOD; + } + if (buf->nRetry == 0 && retry == 0) return 0; @@ -5830,7 +6558,7 @@ SendChunkUDPIFC(ChunkTransportState *transportStates, if (pollAcks(transportStates, pEntry->txfd, timeout)) { - if (handleAcks(transportStates, &pEntry->entry)) + if (handleAcks(transportStates, &pEntry->entry, true)) { /* * We make sure that we deal with the stop messages only after @@ -5843,6 +6571,9 @@ SendChunkUDPIFC(ChunkTransportState *transportStates, } checkExceptions(transportStates, &pEntry->entry, &conn->mConn, retry++, timeout); doCheckExpiration = false; + + if (!doCheckExpiration && icBufferListLength(&conn->unackQueue) == 0 && conn->capacity > 0 && icBufferListLength(&conn->sndQueue) > 0) + sendBuffers(transportStates, &pEntry->entry, &conn->mConn); } conn->mConn.pBuff = (uint8 *) conn->curBuff->pkt; @@ -5987,12 +6718,15 @@ SendEOSUDPIFC(ChunkTransportState *transportStates, timeout = computeTimeout(&conn->mConn, retry); if (pollAcks(transportStates, pEntry->txfd, timeout)) - handleAcks(transportStates, &pEntry->entry); - + handleAcks(transportStates, &pEntry->entry, true); checkExceptions(transportStates, &pEntry->entry, &conn->mConn, retry++, timeout); if (retry >= MAX_TRY) + { + if (icBufferListLength(&conn->unackQueue) == 0) + sendBuffers(transportStates, &pEntry->entry, &conn->mConn); break; + } } if ((!conn->mConn.cdbProc) || (icBufferListLength(&conn->unackQueue) == 0 && @@ -6217,24 +6951,60 @@ getCurrentTime(void) static void putIntoUnackQueueRing(UnackQueueRing *uqr, ICBuffer *buf, uint64 expTime, uint64 now) { + MotionConnUDP *buffConn = NULL; + buffConn = CONTAINER_OF(buf->conn, MotionConnUDP, mConn); uint64 diff = 0; int idx = 0; - - /* The first packet, currentTime is not initialized */ - if (uqr->currentTime == 0) - uqr->currentTime = now - (now % TIMER_SPAN); - - diff = now + expTime - uqr->currentTime; - if (diff >= UNACK_QUEUE_RING_LENGTH) + + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) { + /* The first packet, currentTime is not initialized */ +#ifndef TIMEOUT_Z + if (uqr->currentTime == 0) + uqr->currentTime = now - (now % TIMER_SPAN_LOSS); +#else + if (uqr->currentTime == 0 && buffConn->rttvar.rto == 0) + uqr->currentTime = now - (now % TIMER_SPAN_LOSS); + else + uqr->currentTime = now + buffConn->rttvar.rto; + +#endif + diff = expTime; + if (diff >= UNACK_QUEUE_RING_LENGTH_LOSS) + { #ifdef AMS_VERBOSE_LOGGING - write_log("putIntoUnackQueueRing:" "now " UINT64_FORMAT "expTime " UINT64_FORMAT "diff " UINT64_FORMAT "uqr-currentTime " UINT64_FORMAT, now, expTime, diff, uqr->currentTime); + write_log("putIntoUnackQueueRing:" "now " UINT64_FORMAT "expTime " UINT64_FORMAT "diff " UINT64_FORMAT "uqr-currentTime " UINT64_FORMAT, now, expTime, diff, uqr->currentTime); #endif - diff = UNACK_QUEUE_RING_LENGTH - 1; + diff = UNACK_QUEUE_RING_LENGTH_LOSS - 1; + } + else if (diff < TIMER_SPAN_LOSS) + { + diff = diff < TIMER_SPAN_LOSS ? 
TIMER_SPAN_LOSS : diff; + } } - else if (diff < TIMER_SPAN) + else { - diff = TIMER_SPAN; + if (uqr->currentTime == 0) + uqr->currentTime = now - (now % TIMER_SPAN_LOSS); + + diff = now + expTime - uqr->currentTime; + if (diff >= UNACK_QUEUE_RING_LENGTH) + { +#ifdef AMS_VERBOSE_LOGGING + write_log("putIntoUnackQueueRing:" "now " UINT64_FORMAT "expTime " UINT64_FORMAT "diff " UINT64_FORMAT "uqr-currentTime " UINT64_FORMAT, now, expTime, diff, uqr->currentTime); +#endif + diff = UNACK_QUEUE_RING_LENGTH - 1; + } + else if (diff < TIMER_SPAN) + { + diff = TIMER_SPAN; + } + + idx = (uqr->idx + diff / TIMER_SPAN) % UNACK_QUEUE_RING_SLOTS_NUM; + +#ifdef AMS_VERBOSE_LOGGING + write_log("PUTTW: curtime " UINT64_FORMAT " now " UINT64_FORMAT " (diff " UINT64_FORMAT ") expTime " UINT64_FORMAT " previdx %d, nowidx %d, nextidx %d", uqr->currentTime, now, diff, expTime, buf->unackQueueRingSlot, uqr->idx, idx); +#endif } idx = (uqr->idx + diff / TIMER_SPAN) % UNACK_QUEUE_RING_SLOTS_NUM; @@ -6397,6 +7167,30 @@ handleDataPacket(MotionConn *mConn, icpkthdr *pkt, struct sockaddr_storage *peer logPkt("Interconnect error: received a packet when the queue is full ", pkt); ic_statistics.disorderedPktNum++; conn->stat_count_dropped++; + + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER && rx_control_info.mainWaitingState.waiting && + rx_control_info.mainWaitingState.waitingNode == pkt->motNodeId && + rx_control_info.mainWaitingState.waitingQuery == pkt->icId) + { + if (rx_control_info.mainWaitingState.waitingRoute == ANY_ROUTE) + { + if (rx_control_info.mainWaitingState.reachRoute == ANY_ROUTE) + rx_control_info.mainWaitingState.reachRoute = conn->route; + } + else if (rx_control_info.mainWaitingState.waitingRoute == conn->route) + { + if (DEBUG2 >= log_min_messages) + write_log("rx thread: main_waiting waking it route %d", rx_control_info.mainWaitingState.waitingRoute); + rx_control_info.mainWaitingState.reachRoute = conn->route; + } + /* WAKE MAIN THREAD HERE */ + *wakeup_mainthread = true; + } + + if (Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE) + { + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_FULL, conn->conn_info.seq - 1, conn->conn_info.extraSeq); + } return false; } @@ -6681,9 +7475,25 @@ rxThreadFunc(void *arg) if (conn != NULL) { + uint64 now = getCurrentTime(); + uint64 send_time = pkt->send_time; + uint64 recv_time = now; + uint64 retry_times = pkt->retry_times; + MotionConnUDP *connUdp = NULL; + + connUdp = CONTAINER_OF(conn, MotionConnUDP, mConn); + bool drop_ack = pkt->seq < connUdp->conn_info.seq ? 
true : false; /* Handling a regular packet */ if (handleDataPacket(conn, pkt, &peer, &peerlen, ¶m, &wakeup_mainthread)) pkt = NULL; + if (!pkt) + { + param.msg.send_time = send_time; + param.msg.recv_time = recv_time; + param.msg.retry_times = retry_times; + } + if (drop_ack) + param.msg.retry_times = Gp_interconnect_min_retries_before_timeout; ic_statistics.recvPktNum++; } else diff --git a/contrib/interconnect/udp/ic_udpifc.h b/contrib/interconnect/udp/ic_udpifc.h index 76403abb3f3..af3ca72ba3b 100644 --- a/contrib/interconnect/udp/ic_udpifc.h +++ b/contrib/interconnect/udp/ic_udpifc.h @@ -90,6 +90,9 @@ typedef struct icpkthdr */ uint32 seq; uint32 extraSeq; + uint64_t send_time; + uint64_t recv_time; + uint8_t retry_times; } icpkthdr; typedef struct ICBuffer ICBuffer; diff --git a/contrib/pax_storage/.gitignore b/contrib/pax_storage/.gitignore index 51a328f84e0..87aa2a4a742 100644 --- a/contrib/pax_storage/.gitignore +++ b/contrib/pax_storage/.gitignore @@ -12,6 +12,7 @@ Thumbs.db # Temp files dir +bench_data .tmp/** build*/** results/** diff --git a/contrib/pax_storage/CMakeLists.txt b/contrib/pax_storage/CMakeLists.txt index f4c132c1bd7..e45eab560e6 100644 --- a/contrib/pax_storage/CMakeLists.txt +++ b/contrib/pax_storage/CMakeLists.txt @@ -21,7 +21,10 @@ set(CMAKE_CXX_STANDARD 17) set(TOP_DIR ${PROJECT_SOURCE_DIR}/../..) set(CBDB_INCLUDE_DIR ${TOP_DIR}/src/include) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wno-unused-function -Wno-redundant-move -Wno-error=redundant-move -Wno-error=ignored-qualifiers -Wuninitialized -Winit-self -Wstrict-aliasing -Wno-missing-field-initializers -Wno-unused-parameter -Wno-clobbered -Wno-sized-deallocation -g") +# Base CXX flags +# Note: -Wpessimizing-move is enabled by default in GCC 9+ and will be caught by -Werror +# No need to explicitly add -Werror=pessimizing-move (which breaks GCC 8.x compatibility) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wno-unused-function -Wno-error=ignored-qualifiers -Wno-error=array-bounds -Wuninitialized -Winit-self -Wstrict-aliasing -Wno-missing-field-initializers -Wno-unused-parameter -Wno-clobbered -Wno-sized-deallocation -g") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-parameter -Wno-parameter-name") option(USE_MANIFEST_API "Use manifest API" OFF) @@ -29,7 +32,7 @@ option(USE_PAX_CATALOG "Use manifest API, by pax impl" ON) # Build gtest options option(BUILD_GTEST "Build with google test" ON) -option(BUILD_GBENCH "Build with google benchmark" ON) +option(BUILD_GBENCH "Build with google benchmark" OFF) # Build pax tools option(BUILD_TOOLS "Build with pax tools" OFF) diff --git a/contrib/pax_storage/doc/README.md b/contrib/pax_storage/doc/README.md index b4561e479e5..43393f2f859 100644 --- a/contrib/pax_storage/doc/README.md +++ b/contrib/pax_storage/doc/README.md @@ -42,10 +42,11 @@ PAX has the following features: PAX will be built with `--enable-pax` when you build the Cloudberry. Dependency requirements are as follows: -- **C/C++ Compiler**: GCC/GCC-C++ 11 or later +- **C/C++ Compiler**: GCC/GCC-C++ 8 or later - **CMake**: 3.11 or later - **Protobuf**: 3.5.0 or later - **ZSTD (libzstd)**: 1.4.0 or later +- **liburing**: 2.12 or later Also, you need to run the following command at the top level of the Cloudberry source code directory to download the submodules: @@ -164,14 +165,14 @@ For AM(access methods) in Cloudberry, each AM has customized relation options. 
U | Name | Type | Optional | Default | Description | | :-----: | :----: | :---- | :---- | :---- | -| pax_enable_sparse_filter | bool | `on`/`off` | `on` | Specifies whether to enable sparse filtering based on statistics. | -| pax_enable_row_filter | bool | `on`/`off` | `off` | Specifies whether to enable row filtering. | -| pax_scan_reuse_buffer_size | int | [1048576, 33554432] | 8388608 | The buffer block size used during scanning. | -| pax_max_tuples_per_group | int | [5, 524288] | 131072 | Specifies the maximum number of tuples allowed in each group. | -| pax_max_tuples_per_file | int | [131072, 8388608] | 1310720 | Specifies the maximum number of tuples allowed in each data file. | -| pax_max_size_per_file | int | [8388608, 335544320] | 67108864 | The maximum physical size allowed for each data file. The default value is 67108864 (64MiB). The actual file size might be slightly larger than the set size. Very large or small values might negatively impact performance. | -| pax_enable_toast | bool | `on`/`off` | `on` | Specifies whether to enable TOAST support. | -| pax_min_size_of_compress_toast | int | [524288, 1073741824] | 524288 | Specifies the threshold for creating compressed TOAST tables. If the character length exceeds this threshold, Cloudberry creates compressed TOAST tables for storage. | -| pax_min_size_of_external_toast | int | [10485760, 2147483647] | 10485760 | Specifies the threshold for creating external TOAST tables. If the character length exceeds this threshold, Cloudberry creates external TOAST tables for storage. | -| pax_default_storage_format | string | `porc`/`porc_vec` | `porc` | Controls the default storage format. | -| pax_bloom_filter_work_memory_bytes | int | [1024, 2147483647] | 10240 | Controls the maximum memory allowed for bloom filter usage. | +| pax.enable_sparse_filter | bool | `on`/`off` | `on` | Specifies whether to enable sparse filtering based on statistics. | +| pax.enable_row_filter | bool | `on`/`off` | `off` | Specifies whether to enable row filtering. | +| pax.scan_reuse_buffer_size | int | [1048576, 33554432] | 8388608 | The buffer block size used during scanning. | +| pax.max_tuples_per_group | int | [5, 524288] | 131072 | Specifies the maximum number of tuples allowed in each group. | +| pax.max_tuples_per_file | int | [131072, 8388608] | 1310720 | Specifies the maximum number of tuples allowed in each data file. | +| pax.max_size_per_file | int | [8388608, 335544320] | 67108864 | The maximum physical size allowed for each data file. The default value is 67108864 (64MiB). The actual file size might be slightly larger than the set size. Very large or small values might negatively impact performance. | +| pax.enable_toast | bool | `on`/`off` | `on` | Specifies whether to enable TOAST support. | +| pax.min_size_of_compress_toast | int | [524288, 1073741824] | 524288 | Specifies the threshold for creating compressed TOAST tables. If the character length exceeds this threshold, Cloudberry creates compressed TOAST tables for storage. | +| pax.min_size_of_external_toast | int | [10485760, 2147483647] | 10485760 | Specifies the threshold for creating external TOAST tables. If the character length exceeds this threshold, Cloudberry creates external TOAST tables for storage. | +| pax.default_storage_format | string | `porc`/`porc_vec` | `porc` | Controls the default storage format. | +| pax.bloom_filter_work_memory_bytes | int | [1024, 2147483647] | 10240 | Controls the maximum memory allowed for bloom filter usage. 
| diff --git a/contrib/pax_storage/doc/README.toast.md b/contrib/pax_storage/doc/README.toast.md index 631680a7a9b..44fb084d5e5 100644 --- a/contrib/pax_storage/doc/README.toast.md +++ b/contrib/pax_storage/doc/README.toast.md @@ -60,7 +60,7 @@ Compress toast is consistent with Cloudberry compress toast. This part of the data is a varlena structure, and it is of type `varattrib_4b`, which means that it has a varlena head and the storage range is less than 1G. -In addition, the lower limit of the datum length that needs to be compressed on PAX may be set to larger, and TOAST_TUPLE_THRESHOLD is no longer used as the threshold. Instead, we added a GUC(`pax_min_size_of_compress_toast`) to use. +In addition, the lower limit of the datum length that needs to be compressed on PAX may be set to larger, and TOAST_TUPLE_THRESHOLD is no longer used as the threshold. Instead, we added a GUC(`pax.min_size_of_compress_toast`) to use. ``` typedef union diff --git a/contrib/pax_storage/expected/cluster.out b/contrib/pax_storage/expected/cluster.out index 792c3503a73..c541db45bc2 100644 --- a/contrib/pax_storage/expected/cluster.out +++ b/contrib/pax_storage/expected/cluster.out @@ -1,4 +1,4 @@ -set pax_max_tuples_per_file to 131072; +set pax.max_tuples_per_file to 131072; -- cluster table using index -- start_ignore drop table if EXISTS t_index_cluster; @@ -135,7 +135,7 @@ select ptblockname,ptstatistics,ptisclustered from get_pax_aux_table('t_zorder_c drop table t_zorder_cluster; -- test cluster index -set pax_max_tuples_per_file to 131072; +set pax.max_tuples_per_file to 131072; drop table if EXISTS t_index_cluster; create table t_index_cluster(c1 int, c2 int) with (minmax_columns='c1,c2'); \d+ t_index_cluster; diff --git a/contrib/pax_storage/expected/filter.out b/contrib/pax_storage/expected/filter.out index f182c041a31..44f38a36038 100644 --- a/contrib/pax_storage/expected/filter.out +++ b/contrib/pax_storage/expected/filter.out @@ -1,5 +1,5 @@ -set pax_enable_debug to on; -set pax_enable_sparse_filter = on; +set pax.enable_debug to on; +set pax.enable_sparse_filter = on; create table pax_test.null_test_t(a int, b int, c text) using pax; insert into pax_test.null_test_t(a) select null from generate_series(1,2)i; insert into pax_test.null_test_t select 1, i, 'cc_' || i from generate_series(1,2)i; @@ -222,4 +222,4 @@ kind group, filter rate: 0 / 1 reset client_min_messages; drop table pax_test.in_test_t; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/expected/filter_1.out b/contrib/pax_storage/expected/filter_1.out index 6b45c3be862..d2124c5491c 100644 --- a/contrib/pax_storage/expected/filter_1.out +++ b/contrib/pax_storage/expected/filter_1.out @@ -1,5 +1,5 @@ -set pax_enable_debug to on; -set pax_enable_sparse_filter = on; +set pax.enable_debug to on; +set pax.enable_sparse_filter = on; create table pax_test.null_test_t(a int, b int, c text) using pax; insert into pax_test.null_test_t(a) select null from generate_series(1,2)i; insert into pax_test.null_test_t select 1, i, 'cc_' || i from generate_series(1,2)i; @@ -222,4 +222,4 @@ kind group, filter rate: 0 / 1 reset client_min_messages; drop table pax_test.in_test_t; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/expected/filter_tree.out b/contrib/pax_storage/expected/filter_tree.out index a7115efc1e9..fabf6729d71 100644 --- a/contrib/pax_storage/expected/filter_tree.out +++ b/contrib/pax_storage/expected/filter_tree.out @@ -5,8 +5,8 @@ -- m/No 
sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create or replace function intrc(iint int) returns int as $$ begin return iint; end; @@ -530,8 +530,8 @@ LOG: statement: select count(*) from t1 where intrc(v1) + 10 > v2; (1 row) -- simply the filter tree -set pax_log_filter_tree to on; -LOG: statement: set pax_log_filter_tree to on; +set pax.log_filter_tree to on; +LOG: statement: set pax.log_filter_tree to on; select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: statement: select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: Origin filter tree: @@ -653,8 +653,8 @@ NullTestNode(flags=129) 200 (1 row) -reset pax_log_filter_tree; -LOG: statement: reset pax_log_filter_tree; +reset pax.log_filter_tree; +LOG: statement: reset pax.log_filter_tree; reset client_min_messages; LOG: statement: reset client_min_messages; -- IN && bloom filter diff --git a/contrib/pax_storage/expected/filter_tree_1.out b/contrib/pax_storage/expected/filter_tree_1.out index 4739ac6bb3b..3e158df074b 100644 --- a/contrib/pax_storage/expected/filter_tree_1.out +++ b/contrib/pax_storage/expected/filter_tree_1.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create or replace function intrc(iint int) returns int as $$ begin return iint; end; @@ -530,8 +530,8 @@ LOG: statement: select count(*) from t1 where intrc(v1) + 10 > v2; (1 row) -- simply the filter tree -set pax_log_filter_tree to on; -LOG: statement: set pax_log_filter_tree to on; +set pax.log_filter_tree to on; +LOG: statement: set pax.log_filter_tree to on; select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: statement: select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: Origin filter tree: @@ -660,8 +660,8 @@ NullTestNode(flags=129) 200 (1 row) -reset pax_log_filter_tree; -LOG: statement: reset pax_log_filter_tree; +reset pax.log_filter_tree; +LOG: statement: reset pax.log_filter_tree; reset client_min_messages; LOG: statement: reset client_min_messages; -- IN && bloom filter diff --git a/contrib/pax_storage/expected/filter_tree_arithmetic.out b/contrib/pax_storage/expected/filter_tree_arithmetic.out index 4486c647d6d..d1c49823f9b 100644 --- a/contrib/pax_storage/expected/filter_tree_arithmetic.out +++ b/contrib/pax_storage/expected/filter_tree_arithmetic.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create table t_arithmetic(same int, v1 int, v2 int, v3 int) using pax with (minmax_columns='v1,v2,v3'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'same' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
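-- Illustrative sketch (assumes a pax-enabled build; added for clarity, not part of the patch):
-- with the rename reflected in the README table and the expected-output updates above, PAX
-- settings use the "pax." custom-variable prefix instead of the former "pax_" GUC names, e.g.:
set pax.enable_sparse_filter = on;   -- was: set pax_enable_sparse_filter = on;
set pax.max_tuples_per_group to 5;   -- was: set pax_max_tuples_per_group to 5;
show pax.default_storage_format;     -- 'porc' by default per the README table
reset pax.enable_sparse_filter;
reset pax.max_tuples_per_group;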
diff --git a/contrib/pax_storage/expected/filter_tree_arithmetic_1.out b/contrib/pax_storage/expected/filter_tree_arithmetic_1.out index b333d2bf625..b3fb2cb8e92 100644 --- a/contrib/pax_storage/expected/filter_tree_arithmetic_1.out +++ b/contrib/pax_storage/expected/filter_tree_arithmetic_1.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create table t_arithmetic(same int, v1 int, v2 int, v3 int) using pax with (minmax_columns='v1,v2,v3'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'same' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. diff --git a/contrib/pax_storage/expected/filter_tree_optimizer.out b/contrib/pax_storage/expected/filter_tree_optimizer.out index 6f07519949f..437408e52b6 100644 --- a/contrib/pax_storage/expected/filter_tree_optimizer.out +++ b/contrib/pax_storage/expected/filter_tree_optimizer.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create or replace function intrc(iint int) returns int as $$ begin return iint; end; @@ -530,8 +530,8 @@ LOG: statement: select count(*) from t1 where intrc(v1) + 10 > v2; (1 row) -- simply the filter tree -set pax_log_filter_tree to on; -LOG: statement: set pax_log_filter_tree to on; +set pax.log_filter_tree to on; +LOG: statement: set pax.log_filter_tree to on; select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: statement: select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: Origin filter tree: @@ -656,8 +656,8 @@ NullTestNode(flags=129) 200 (1 row) -reset pax_log_filter_tree; -LOG: statement: reset pax_log_filter_tree; +reset pax.log_filter_tree; +LOG: statement: reset pax.log_filter_tree; reset client_min_messages; LOG: statement: reset client_min_messages; -- IN && bloom filter diff --git a/contrib/pax_storage/expected/filter_tree_optimizer_1.out b/contrib/pax_storage/expected/filter_tree_optimizer_1.out index a4c4c168664..19049ca4cd8 100644 --- a/contrib/pax_storage/expected/filter_tree_optimizer_1.out +++ b/contrib/pax_storage/expected/filter_tree_optimizer_1.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create or replace function intrc(iint int) returns int as $$ begin return iint; end; @@ -530,8 +530,8 @@ LOG: statement: select count(*) from t1 where intrc(v1) + 10 > v2; (1 row) -- simply the filter tree -set pax_log_filter_tree to on; -LOG: statement: set pax_log_filter_tree to on; +set pax.log_filter_tree to on; +LOG: statement: set pax.log_filter_tree to on; select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: statement: select count(*) from t1 where v1 > 10 or intrc(v2) < 120; LOG: Origin filter tree: @@ -664,8 +664,8 @@ NullTestNode(flags=129) 200 (1 row) -reset pax_log_filter_tree; -LOG: statement: reset pax_log_filter_tree; +reset pax.log_filter_tree; +LOG: statement: reset pax.log_filter_tree; reset client_min_messages; LOG: statement: reset 
client_min_messages; -- IN && bloom filter diff --git a/contrib/pax_storage/expected/filter_tree_root_quals.out b/contrib/pax_storage/expected/filter_tree_root_quals.out index c638550cb8a..6ca467e7142 100644 --- a/contrib/pax_storage/expected/filter_tree_root_quals.out +++ b/contrib/pax_storage/expected/filter_tree_root_quals.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create table t1(same int, v1 int, v2 int, v3 int, v4 int) using pax with (minmax_columns='v1,v2,v3,v4'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'same' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. diff --git a/contrib/pax_storage/expected/filter_tree_root_quals_1.out b/contrib/pax_storage/expected/filter_tree_root_quals_1.out index 434eb449b59..d473544e73b 100644 --- a/contrib/pax_storage/expected/filter_tree_root_quals_1.out +++ b/contrib/pax_storage/expected/filter_tree_root_quals_1.out @@ -5,8 +5,8 @@ -- m/No sparse filter/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create table t1(same int, v1 int, v2 int, v3 int, v4 int) using pax with (minmax_columns='v1,v2,v3,v4'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'same' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
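The statistics tests below shrink `pax.max_tuples_per_group` so that each row group holds only a handful of tuples, which makes the per-group min/max statistics (and, in the bloom-filter tests, `pax.bloom_filter_work_memory_bytes` with `bloomfilter_columns`) easy to exercise. A hedged sketch of that pattern, using an illustrative table that is not part of the suite:

```sql
-- Illustrative only: force tiny row groups so many groups exist,
-- letting the sparse filter prune groups by their min/max ranges.
-- Assumes pax.enable_debug and pax.enable_sparse_filter are already on, as above.
set pax.max_tuples_per_group to 5;

create table stats_demo(v1 int, v2 text) using pax with (minmax_columns='v1,v2');
insert into stats_demo select g, 'row_' || g::text from generate_series(1, 100) g;

-- groups whose [min, max] range cannot contain v1 <= 20 are skipped entirely
select count(*) from stats_demo where v1 <= 20;

reset pax.max_tuples_per_group;
```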
diff --git a/contrib/pax_storage/expected/statistics/min_max_bit_byte_types.out b/contrib/pax_storage/expected/statistics/min_max_bit_byte_types.out index 11b333296dc..aca12f8eb6a 100644 --- a/contrib/pax_storage/expected/statistics/min_max_bit_byte_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_bit_byte_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the bool min/max types support -- @@ -609,6 +609,6 @@ kind group, filter rate: 1 / 4 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_varbit; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_float_types.out b/contrib/pax_storage/expected/statistics/min_max_float_types.out index c820fecf38c..620f8c1b9d4 100644 --- a/contrib/pax_storage/expected/statistics/min_max_float_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_float_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the float4 min/max types support -- @@ -809,6 +809,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_float8; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_geo_types.out b/contrib/pax_storage/expected/statistics/min_max_geo_types.out index b067ed69207..2a1bf609fde 100644 --- a/contrib/pax_storage/expected/statistics/min_max_geo_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_geo_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the box min/max types support -- @@ -837,6 +837,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_path; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_int_types.out b/contrib/pax_storage/expected/statistics/min_max_int_types.out index 1c08ce15288..c60fa8f247d 100644 --- a/contrib/pax_storage/expected/statistics/min_max_int_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_int_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; 
+set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the int2 min/max types support -- @@ -2015,6 +2015,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_numeric; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_net_types.out b/contrib/pax_storage/expected/statistics/min_max_net_types.out index 97c288ee5ab..801ab2ca69e 100644 --- a/contrib/pax_storage/expected/statistics/min_max_net_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_net_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the inet min/max types support -- @@ -644,6 +644,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_mac8; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_other_types.out b/contrib/pax_storage/expected/statistics/min_max_other_types.out index f62316c5f0e..bb79c1e3ee1 100644 --- a/contrib/pax_storage/expected/statistics/min_max_other_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_other_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the oid min/max types support -- @@ -637,6 +637,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_uuid; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_text_types.out b/contrib/pax_storage/expected/statistics/min_max_text_types.out index 6e791f40cb2..119396957c5 100644 --- a/contrib/pax_storage/expected/statistics/min_max_text_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_text_types.out @@ -5,9 +5,9 @@ -- m/Feature not supported/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the text min/max types support -- @@ -3210,6 +3210,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_name; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/min_max_time_types.out 
b/contrib/pax_storage/expected/statistics/min_max_time_types.out index 6b15a011dde..c279e8207cb 100644 --- a/contrib/pax_storage/expected/statistics/min_max_time_types.out +++ b/contrib/pax_storage/expected/statistics/min_max_time_types.out @@ -4,9 +4,9 @@ -- m/scan key build success/ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the date min/max types support -- @@ -2485,6 +2485,6 @@ LOG: kind file, filter rate: 0 / 2 reset client_min_messages; LOG: statement: reset client_min_messages; drop table t_interval; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics/statistics.out b/contrib/pax_storage/expected/statistics/statistics.out index 97b1a7bba9f..90a06b8cfd2 100644 --- a/contrib/pax_storage/expected/statistics/statistics.out +++ b/contrib/pax_storage/expected/statistics/statistics.out @@ -2,7 +2,7 @@ set default_table_access_method = pax; -- -- Test with small group -- -set pax_max_tuples_per_group = 10; +set pax.max_tuples_per_group = 10; -- test min/max type support create table t1(v1 int, v2 text, v3 float8, v4 bool) with (minmax_columns='v1,v2,v3,v4'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1' as the Apache Cloudberry data distribution key for this table. @@ -186,7 +186,7 @@ select * from get_pax_aux_table('t_interval'); (3 rows) drop table t_interval; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; -- -- Test with small group -- @@ -347,7 +347,7 @@ drop table t_interval; -- -- Test the update/delete DML, stats should be updated -- -set pax_max_tuples_per_group = 250; +set pax.max_tuples_per_group = 250; -- delete part of data in the first group create table t1_update_stats(v1 int, v2 int, v3 int) with (minmax_columns='v1,v2'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1' as the Apache Cloudberry data distribution key for this table. @@ -660,7 +660,7 @@ select * from get_pax_aux_table('t10_update_stats'); drop table t10_update_stats; -- delete twice -set pax_max_tuples_per_group = 25; +set pax.max_tuples_per_group = 25; create table t_delete_twice_stats(v1 int, v2 int, v3 int) with (minmax_columns='v2,v3'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
@@ -743,4 +743,4 @@ select * from get_pax_aux_table('t_update_twice_stats'); (3 rows) drop table t_update_twice_stats; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/statistics_bloom_filter.out b/contrib/pax_storage/expected/statistics_bloom_filter.out index bb2b4ae7c19..ec96bcfbc97 100644 --- a/contrib/pax_storage/expected/statistics_bloom_filter.out +++ b/contrib/pax_storage/expected/statistics_bloom_filter.out @@ -6,12 +6,12 @@ -- m/^LOG: Missing statistics for column: .*/ -- end_matchignore set default_table_access_method = pax; -set pax_enable_debug to on; +set pax.enable_debug to on; -- -- Test with small group -- -set pax_max_tuples_per_group = 10; -set pax_bloom_filter_work_memory_bytes = 102400; -- 100kb +set pax.max_tuples_per_group = 10; +set pax.bloom_filter_work_memory_bytes = 102400; -- 100kb -- create pax table with bloom filter reloptions create table t1(v1 int, v2 text, v3 varchar, v4 varchar(100), v5 bit, v6 float, v7 numeric, v8 numeric(20,10)) with (bloomfilter_columns='v1,v2,v3,v4,v5,v6,v7,v8'); @@ -20,7 +20,7 @@ create table t2(v1 int, v2 text, v3 varchar, v4 varchar(100), v5 bit, v6 float, drop table t1; drop table t2; -- test bloom filter(only work on IN case) -set pax_enable_sparse_filter to on; +set pax.enable_sparse_filter to on; -- the fixed length and type by value type create table t1(single_seg int, v1 int, v2 int) with (bloomfilter_columns='v1,v2'); insert into t1 values(1, generate_series(1, 100), generate_series(101, 200)); @@ -405,8 +405,8 @@ kind group, filter rate: 2 / 4 reset client_min_messages; drop table t3; -- test the big bloom filter -set pax_max_tuples_per_group to 16384; -set pax_max_tuples_per_file to 131072; +set pax.max_tuples_per_group to 16384; +set pax.max_tuples_per_file to 131072; create table t4(single_seg int, v1 varchar, v2 varchar) with (bloomfilter_columns='v1,v2'); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'single_seg' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
@@ -458,7 +458,7 @@ kind group, filter rate: 7 / 8 (0 rows) reset client_min_messages; -reset pax_bloom_filter_work_memory_bytes; -reset pax_max_tuples_per_group; -reset pax_max_tuples_per_file; -reset pax_enable_sparse_filter; +reset pax.bloom_filter_work_memory_bytes; +reset pax.max_tuples_per_group; +reset pax.max_tuples_per_file; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/expected/toast.out b/contrib/pax_storage/expected/toast.out index 3ec9dcd61ad..9306c12137f 100644 --- a/contrib/pax_storage/expected/toast.out +++ b/contrib/pax_storage/expected/toast.out @@ -1,5 +1,5 @@ set default_table_access_method = pax; -set pax_enable_toast to true; +set pax.enable_toast to true; CREATE TABLE pax_toasttest_t1(v1 text); alter TABLE pax_toasttest_t1 alter column v1 set storage plain; insert into pax_toasttest_t1 values(repeat('0', 100000000)); @@ -131,9 +131,9 @@ select length(v2), length(v4) from pax_toasttest_t2; drop table pax_toasttest_t2; ----------------------------------------------------------------------- --- a copy test cases that should be sucess when pax_enable_toast is off +-- a copy test cases that should be sucess when pax.enable_toast is off ----------------------------------------------------------------------- -set pax_enable_toast to false; +set pax.enable_toast to false; CREATE TABLE pax_toasttest_t1(v1 text); alter TABLE pax_toasttest_t1 alter column v1 set storage plain; insert into pax_toasttest_t1 values(repeat('0', 100000000)); diff --git a/contrib/pax_storage/expected/toast_failed.out b/contrib/pax_storage/expected/toast_failed.out index dfcd8a505b8..7e758f426df 100644 --- a/contrib/pax_storage/expected/toast_failed.out +++ b/contrib/pax_storage/expected/toast_failed.out @@ -1,5 +1,5 @@ set default_table_access_method = pax; -set pax_enable_toast to true; +set pax.enable_toast to true; -- test compress failed -- random varchar always make compress toast failed create or replace function random_string(integer) diff --git a/contrib/pax_storage/expected/visimap_vec_compact.out b/contrib/pax_storage/expected/visimap_vec_compact.out index e372f4b3054..b7c0a2b061f 100644 --- a/contrib/pax_storage/expected/visimap_vec_compact.out +++ b/contrib/pax_storage/expected/visimap_vec_compact.out @@ -2,7 +2,7 @@ -- so we can test multiple group with visimap, -- at the beginning, in the middle, or at the end -- of a group. -set pax_max_tuples_per_group = 5; +set pax.max_tuples_per_group = 5; -- column types contain: -- 1. normal fix-length column, like int -- 2. normal variable-length column, like text @@ -235,4 +235,4 @@ select * from pt1 where a = 1 and i >= 2 and i < 10; rollback; reset optimizer; drop table pt1; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/visimap_vec_compact_1.out b/contrib/pax_storage/expected/visimap_vec_compact_1.out index 1cd08636a70..e4424dbe3a1 100644 --- a/contrib/pax_storage/expected/visimap_vec_compact_1.out +++ b/contrib/pax_storage/expected/visimap_vec_compact_1.out @@ -2,7 +2,7 @@ -- so we can test multiple group with visimap, -- at the beginning, in the middle, or at the end -- of a group. -set pax_max_tuples_per_group = 5; +set pax.max_tuples_per_group = 5; -- column types contain: -- 1. normal fix-length column, like int -- 2. 
normal variable-length column, like text @@ -235,4 +235,4 @@ select * from pt1 where a = 1 and i >= 2 and i < 10; rollback; reset optimizer; drop table pt1; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/visimap_vec_storage.out b/contrib/pax_storage/expected/visimap_vec_storage.out index fbc3b4ef549..ab36db2e7e4 100644 --- a/contrib/pax_storage/expected/visimap_vec_storage.out +++ b/contrib/pax_storage/expected/visimap_vec_storage.out @@ -2,7 +2,7 @@ -- so we can test multiple group with visimap, -- at the beginning, in the middle, or at the end -- of a group. -set pax_max_tuples_per_group = 5; +set pax.max_tuples_per_group = 5; -- column types contain: -- 1. normal fix-length column, like int -- 2. normal variable-length column, like text @@ -235,4 +235,4 @@ select * from pt1 where a = 1 and i >= 2 and i < 10; rollback; reset optimizer; drop table pt1; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/expected/visimap_vec_storage_1.out b/contrib/pax_storage/expected/visimap_vec_storage_1.out index 336e1f94009..51068b7ec84 100644 --- a/contrib/pax_storage/expected/visimap_vec_storage_1.out +++ b/contrib/pax_storage/expected/visimap_vec_storage_1.out @@ -2,7 +2,7 @@ -- so we can test multiple group with visimap, -- at the beginning, in the middle, or at the end -- of a group. -set pax_max_tuples_per_group = 5; +set pax.max_tuples_per_group = 5; -- column types contain: -- 1. normal fix-length column, like int -- 2. normal variable-length column, like text @@ -235,4 +235,4 @@ select * from pt1 where a = 1 and i >= 2 and i < 10; rollback; reset optimizer; drop table pt1; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/cluster.sql b/contrib/pax_storage/sql/cluster.sql index 30da852522c..9b44fbdc6be 100644 --- a/contrib/pax_storage/sql/cluster.sql +++ b/contrib/pax_storage/sql/cluster.sql @@ -1,4 +1,4 @@ -set pax_max_tuples_per_file to 131072; +set pax.max_tuples_per_file to 131072; -- cluster table using index -- start_ignore @@ -63,7 +63,7 @@ select ptblockname,ptstatistics,ptisclustered from get_pax_aux_table('t_zorder_c drop table t_zorder_cluster; -- test cluster index -set pax_max_tuples_per_file to 131072; +set pax.max_tuples_per_file to 131072; drop table if EXISTS t_index_cluster; create table t_index_cluster(c1 int, c2 int) with (minmax_columns='c1,c2'); \d+ t_index_cluster; diff --git a/contrib/pax_storage/sql/filter.sql b/contrib/pax_storage/sql/filter.sql index c2548f50703..baf4d619c6a 100644 --- a/contrib/pax_storage/sql/filter.sql +++ b/contrib/pax_storage/sql/filter.sql @@ -1,5 +1,5 @@ -set pax_enable_debug to on; -set pax_enable_sparse_filter = on; +set pax.enable_debug to on; +set pax.enable_sparse_filter = on; create table pax_test.null_test_t(a int, b int, c text) using pax; insert into pax_test.null_test_t(a) select null from generate_series(1,2)i; @@ -73,4 +73,4 @@ select * from pax_test.in_test_t where h in ('bp_2', 'bp_4'); reset client_min_messages; drop table pax_test.in_test_t; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/sql/filter_tree.sql b/contrib/pax_storage/sql/filter_tree.sql index 04f8cbce942..81b350daa1e 100644 --- a/contrib/pax_storage/sql/filter_tree.sql +++ b/contrib/pax_storage/sql/filter_tree.sql @@ -7,8 +7,8 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set 
pax.enable_debug to on; +set pax.enable_sparse_filter to on; create or replace function intrc(iint int) returns int as $$ @@ -132,11 +132,11 @@ select count(*) from t1 where v1 + v2 < v3; select count(*) from t1 where intrc(v1) + 10 > v2; -- simply the filter tree -set pax_log_filter_tree to on; +set pax.log_filter_tree to on; select count(*) from t1 where v1 > 10 or intrc(v2) < 120; select count(*) from t1 where v1 > 10 and intrc(v2) < 120; select count(*) from t1 where v1 is not null; -reset pax_log_filter_tree; +reset pax.log_filter_tree; reset client_min_messages; diff --git a/contrib/pax_storage/sql/filter_tree_arithmetic.sql b/contrib/pax_storage/sql/filter_tree_arithmetic.sql index 40339ecc393..deb136c9e29 100644 --- a/contrib/pax_storage/sql/filter_tree_arithmetic.sql +++ b/contrib/pax_storage/sql/filter_tree_arithmetic.sql @@ -7,8 +7,8 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create table t_arithmetic(same int, v1 int, v2 int, v3 int) using pax with (minmax_columns='v1,v2,v3'); create table ta_mul(same int, v1 int, v2 int, v3 int) using pax with (minmax_columns='v1,v2,v3'); diff --git a/contrib/pax_storage/sql/filter_tree_root_quals.sql b/contrib/pax_storage/sql/filter_tree_root_quals.sql index 523f7a67f43..f4d4880f3bf 100644 --- a/contrib/pax_storage/sql/filter_tree_root_quals.sql +++ b/contrib/pax_storage/sql/filter_tree_root_quals.sql @@ -6,8 +6,8 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; create table t1(same int, v1 int, v2 int, v3 int, v4 int) using pax with (minmax_columns='v1,v2,v3,v4'); create table t2(same int, v1 int, v2 int, v3 int, v4 int) using pax with (minmax_columns='v1,v2,v3,v4'); diff --git a/contrib/pax_storage/sql/statistics/min_max_bit_byte_types.sql b/contrib/pax_storage/sql/statistics/min_max_bit_byte_types.sql index 73bc0a26596..a9d5822c6e6 100644 --- a/contrib/pax_storage/sql/statistics/min_max_bit_byte_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_bit_byte_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the bool min/max types support @@ -151,8 +151,8 @@ select count(*) from t_varbit where v = '1'::bit; reset client_min_messages; drop table t_varbit; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_float_types.sql b/contrib/pax_storage/sql/statistics/min_max_float_types.sql index db17cf105ab..1ece90d9e4e 100644 --- a/contrib/pax_storage/sql/statistics/min_max_float_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_float_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the float4 min/max types support @@ -156,6 +156,6 @@ select count(*) from t_float8 where v 
<= 20::float4; reset client_min_messages; drop table t_float8; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_geo_types.sql b/contrib/pax_storage/sql/statistics/min_max_geo_types.sql index 02eb4929e76..575f0fe6721 100644 --- a/contrib/pax_storage/sql/statistics/min_max_geo_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_geo_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the box min/max types support @@ -194,6 +194,6 @@ select count(*) from t_path where v <= path'(0,0),(1,1),(2,20)'; reset client_min_messages; drop table t_path; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_int_types.sql b/contrib/pax_storage/sql/statistics/min_max_int_types.sql index 68d9118f102..abcd1eb6374 100644 --- a/contrib/pax_storage/sql/statistics/min_max_int_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_int_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the int2 min/max types support @@ -366,6 +366,6 @@ select count(*) from t_numeric where v <= 20::numeric; reset client_min_messages; drop table t_numeric; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_net_types.sql b/contrib/pax_storage/sql/statistics/min_max_net_types.sql index fa46533689b..ceb19783ca6 100644 --- a/contrib/pax_storage/sql/statistics/min_max_net_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_net_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the inet min/max types support @@ -154,6 +154,6 @@ select count(*) from t_mac8 where v <= '12:34:56:00:00:00:00:20'::macaddr8; reset client_min_messages; drop table t_mac8; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_other_types.sql b/contrib/pax_storage/sql/statistics/min_max_other_types.sql index 1ebc0a3a504..79e93ccf83f 100644 --- a/contrib/pax_storage/sql/statistics/min_max_other_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_other_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set 
pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the oid min/max types support @@ -145,6 +145,6 @@ select count(*) from t_uuid where v <= 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a30'::u reset client_min_messages; drop table t_uuid; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_text_types.sql b/contrib/pax_storage/sql/statistics/min_max_text_types.sql index 66e7ed91b21..0dfd4c9f11b 100644 --- a/contrib/pax_storage/sql/statistics/min_max_text_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_text_types.sql @@ -6,9 +6,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the text min/max types support @@ -590,6 +590,6 @@ select count(*) from t_name where v <= 'pft'::varchar; reset client_min_messages; drop table t_name; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/min_max_time_types.sql b/contrib/pax_storage/sql/statistics/min_max_time_types.sql index 3a8c9fab872..710b5141926 100644 --- a/contrib/pax_storage/sql/statistics/min_max_time_types.sql +++ b/contrib/pax_storage/sql/statistics/min_max_time_types.sql @@ -5,9 +5,9 @@ -- end_matchignore set default_table_access_method to pax; -set pax_enable_debug to on; -set pax_enable_sparse_filter to on; -set pax_max_tuples_per_group to 5; +set pax.enable_debug to on; +set pax.enable_sparse_filter to on; +set pax.max_tuples_per_group to 5; -- -- Test the date min/max types support @@ -501,6 +501,6 @@ select count(*) from t_interval where v <= '20 second'::interval; reset client_min_messages; drop table t_interval; -reset pax_enable_debug; -reset pax_enable_sparse_filter; -reset pax_max_tuples_per_group; +reset pax.enable_debug; +reset pax.enable_sparse_filter; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics/statistics.sql b/contrib/pax_storage/sql/statistics/statistics.sql index 7df0302cc67..00a9337e91d 100644 --- a/contrib/pax_storage/sql/statistics/statistics.sql +++ b/contrib/pax_storage/sql/statistics/statistics.sql @@ -4,7 +4,7 @@ set default_table_access_method = pax; -- -- Test with small group -- -set pax_max_tuples_per_group = 10; +set pax.max_tuples_per_group = 10; -- test min/max type support create table t1(v1 int, v2 text, v3 float8, v4 bool) with (minmax_columns='v1,v2,v3,v4'); @@ -87,7 +87,7 @@ insert into t_interval values select * from get_pax_aux_table('t_interval'); drop table t_interval; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; -- -- Test with small group @@ -162,7 +162,7 @@ drop table t_interval; -- -- Test the update/delete DML, stats should be updated -- -set pax_max_tuples_per_group = 250; +set pax.max_tuples_per_group = 250; -- delete part of data in the first group create table t1_update_stats(v1 int, v2 int, v3 int) with (minmax_columns='v1,v2'); @@ -276,7 +276,7 @@ drop table t10_update_stats; -- delete twice -set pax_max_tuples_per_group = 25; +set pax.max_tuples_per_group = 25; create table t_delete_twice_stats(v1 int, v2 int, v3 int) with 
(minmax_columns='v2,v3'); insert into t_delete_twice_stats values(1, generate_series(1, 100), generate_series(101, 200)); select * from get_pax_aux_table('t_delete_twice_stats'); @@ -304,4 +304,4 @@ select sum(v2), sum(v3) from t_update_twice_stats; select * from get_pax_aux_table('t_update_twice_stats'); drop table t_update_twice_stats; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/statistics_bloom_filter.sql b/contrib/pax_storage/sql/statistics_bloom_filter.sql index 53cfbf9418c..a19f8b1bd83 100644 --- a/contrib/pax_storage/sql/statistics_bloom_filter.sql +++ b/contrib/pax_storage/sql/statistics_bloom_filter.sql @@ -6,12 +6,12 @@ -- m/^LOG: Missing statistics for column: .*/ -- end_matchignore set default_table_access_method = pax; -set pax_enable_debug to on; +set pax.enable_debug to on; -- -- Test with small group -- -set pax_max_tuples_per_group = 10; -set pax_bloom_filter_work_memory_bytes = 102400; -- 100kb +set pax.max_tuples_per_group = 10; +set pax.bloom_filter_work_memory_bytes = 102400; -- 100kb -- create pax table with bloom filter reloptions create table t1(v1 int, v2 text, v3 varchar, v4 varchar(100), v5 bit, v6 float, v7 numeric, v8 numeric(20,10)) @@ -23,7 +23,7 @@ drop table t1; drop table t2; -- test bloom filter(only work on IN case) -set pax_enable_sparse_filter to on; +set pax.enable_sparse_filter to on; -- the fixed length and type by value type create table t1(single_seg int, v1 int, v2 int) with (bloomfilter_columns='v1,v2'); @@ -124,8 +124,8 @@ reset client_min_messages; drop table t3; -- test the big bloom filter -set pax_max_tuples_per_group to 16384; -set pax_max_tuples_per_file to 131072; +set pax.max_tuples_per_group to 16384; +set pax.max_tuples_per_file to 131072; create table t4(single_seg int, v1 varchar, v2 varchar) with (bloomfilter_columns='v1,v2'); insert into t4 values(1, generate_series(1, 1000000), generate_series(1000001, 2000000)); @@ -140,7 +140,7 @@ select * from t4 where v1 > '1' and v2 in ('8', '1000009'); select * from t4 where v1 in ('8', '1000009') and v2 in ('8', '1000009'); reset client_min_messages; -reset pax_bloom_filter_work_memory_bytes; -reset pax_max_tuples_per_group; -reset pax_max_tuples_per_file; -reset pax_enable_sparse_filter; +reset pax.bloom_filter_work_memory_bytes; +reset pax.max_tuples_per_group; +reset pax.max_tuples_per_file; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/sql/toast.sql b/contrib/pax_storage/sql/toast.sql index c4c920617d1..3a8b267cc19 100644 --- a/contrib/pax_storage/sql/toast.sql +++ b/contrib/pax_storage/sql/toast.sql @@ -1,6 +1,6 @@ set default_table_access_method = pax; -set pax_enable_toast to true; +set pax.enable_toast to true; CREATE TABLE pax_toasttest_t1(v1 text); alter TABLE pax_toasttest_t1 alter column v1 set storage plain; insert into pax_toasttest_t1 values(repeat('0', 100000000)); @@ -48,9 +48,9 @@ select length(v2), length(v4) from pax_toasttest_t2; drop table pax_toasttest_t2; ----------------------------------------------------------------------- --- a copy test cases that should be sucess when pax_enable_toast is off +-- a copy test cases that should be sucess when pax.enable_toast is off ----------------------------------------------------------------------- -set pax_enable_toast to false; +set pax.enable_toast to false; CREATE TABLE pax_toasttest_t1(v1 text); alter TABLE pax_toasttest_t1 alter column v1 set storage plain; insert into pax_toasttest_t1 values(repeat('0', 100000000)); diff --git 
a/contrib/pax_storage/sql/toast_failed.sql b/contrib/pax_storage/sql/toast_failed.sql index f02749586d3..c9139b474ce 100644 --- a/contrib/pax_storage/sql/toast_failed.sql +++ b/contrib/pax_storage/sql/toast_failed.sql @@ -1,6 +1,6 @@ set default_table_access_method = pax; -set pax_enable_toast to true; +set pax.enable_toast to true; -- test compress failed -- random varchar always make compress toast failed diff --git a/contrib/pax_storage/sql/visimap_vec_compact.sql b/contrib/pax_storage/sql/visimap_vec_compact.sql index 6454c6f0b63..2989e40b3a2 100644 --- a/contrib/pax_storage/sql/visimap_vec_compact.sql +++ b/contrib/pax_storage/sql/visimap_vec_compact.sql @@ -2,7 +2,7 @@ -- so we can test multiple group with visimap, -- at the beginning, in the middle, or at the end -- of a group. -set pax_max_tuples_per_group = 5; +set pax.max_tuples_per_group = 5; -- column types contain: -- 1. normal fix-length column, like int @@ -78,4 +78,4 @@ reset optimizer; drop table pt1; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/sql/visimap_vec_storage.sql b/contrib/pax_storage/sql/visimap_vec_storage.sql index eac6eddc55e..daf8dec1ff9 100644 --- a/contrib/pax_storage/sql/visimap_vec_storage.sql +++ b/contrib/pax_storage/sql/visimap_vec_storage.sql @@ -2,7 +2,7 @@ -- so we can test multiple group with visimap, -- at the beginning, in the middle, or at the end -- of a group. -set pax_max_tuples_per_group = 5; +set pax.max_tuples_per_group = 5; -- column types contain: -- 1. normal fix-length column, like int @@ -78,4 +78,4 @@ reset optimizer; drop table pt1; -reset pax_max_tuples_per_group; +reset pax.max_tuples_per_group; diff --git a/contrib/pax_storage/src/api/python3/paxfilereader_type.cc b/contrib/pax_storage/src/api/python3/paxfilereader_type.cc index 3731ab613a5..262c49e592a 100644 --- a/contrib/pax_storage/src/api/python3/paxfilereader_type.cc +++ b/contrib/pax_storage/src/api/python3/paxfilereader_type.cc @@ -129,7 +129,7 @@ static int paxfilereader_init(PyObject *self, PyObject *args, PyObject *schema = NULL, *proj = NULL, *pax_file = NULL; PaxFileObject *pax_file_obj; std::shared_ptr visible_map_bm = nullptr; - std::shared_ptr toast_file = nullptr; + std::unique_ptr toast_file = nullptr; PaxFileReaderObject *pax_file_reader; pax_file_reader = (PaxFileReaderObject *)self; @@ -227,7 +227,7 @@ static int paxfilereader_init(PyObject *self, PyObject *args, auto file_ptr = pax::Singleton::GetInstance()->Open( pax_file_obj->filepath, pax::fs::kReadMode); - auto reader = new pax::OrcReader(std::move(file_ptr), toast_file); + auto reader = new pax::OrcReader(std::move(file_ptr), std::move(toast_file)); reader->Open(std::move(read_options)); pax_file_reader->reader = reader; } catch (cbdb::CException &e) { @@ -345,7 +345,6 @@ static PyObject *paxfilereader_readgroup(PaxFileReaderObject *self, try { for (; column_index < col_nums; column_index++) { const auto &column = (*columns)[column_index]; - std::shared_ptr bm; auto null_counts = 0; PyObject *schema_item = nullptr; long col_oid; @@ -373,7 +372,7 @@ static PyObject *paxfilereader_readgroup(PaxFileReaderObject *self, continue; } - bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); py_rows = PyList_New(0); if (!py_rows) { @@ -403,7 +402,7 @@ static PyObject *paxfilereader_readgroup(PaxFileReaderObject *self, if (column->IsToast(row_index)) { // safe to no keep the ref, because paxbuffer_to_pyobj will copy the // detoast datum - std::shared_ptr ref = nullptr; + std::unique_ptr ref = 
nullptr; auto datum = PointerGetDatum(buff); auto external_buffer = column->GetExternalToastDataBuffer(); diff --git a/contrib/pax_storage/src/api/python3/setup-debug.py b/contrib/pax_storage/src/api/python3/setup-debug.py index 13eadaa357d..da02e2ab237 100644 --- a/contrib/pax_storage/src/api/python3/setup-debug.py +++ b/contrib/pax_storage/src/api/python3/setup-debug.py @@ -51,7 +51,7 @@ def abs_path(file_name): version = '1.0', description = 'PAXPY is the PYTHON3 API of PAX', author = 'jiaqizho', - author_email = 'jiaqizho@hashdata.cn', + author_email = 'jiaqizho@apache.org', url = '-', ext_modules = [paxpy_module] -) \ No newline at end of file +) diff --git a/contrib/pax_storage/src/api/python3/setup.py b/contrib/pax_storage/src/api/python3/setup.py index a60cc779fd9..b739cacc465 100644 --- a/contrib/pax_storage/src/api/python3/setup.py +++ b/contrib/pax_storage/src/api/python3/setup.py @@ -48,7 +48,7 @@ def abs_path(file_name): version = '1.0', description = 'PAXPY is the PYTHON3 API of PAX', author = 'jiaqizho', - author_email = 'jiaqizho@hashdata.cn', + author_email = 'jiaqizho@apache.org', url = '-', ext_modules = [paxpy_module] -) \ No newline at end of file +) diff --git a/contrib/pax_storage/src/cpp/access/pax_access_handle.cc b/contrib/pax_storage/src/cpp/access/pax_access_handle.cc index da07cddd5d7..12caf0e9f64 100644 --- a/contrib/pax_storage/src/cpp/access/pax_access_handle.cc +++ b/contrib/pax_storage/src/cpp/access/pax_access_handle.cc @@ -412,8 +412,6 @@ void CCPaxAccessMethod::FinishBulkInsert(Relation relation, int options) { } void CCPaxAccessMethod::ExtDmlInit(Relation rel, CmdType operation) { - if (!RELATION_IS_PAX(rel)) return; - CBDB_TRY(); { pax::CPaxDmlStateLocal::Instance()->InitDmlState(rel, operation); } CBDB_CATCH_DEFAULT(); @@ -422,8 +420,6 @@ void CCPaxAccessMethod::ExtDmlInit(Relation rel, CmdType operation) { } void CCPaxAccessMethod::ExtDmlFini(Relation rel, CmdType operation) { - if (!RELATION_IS_PAX(rel)) return; - CBDB_TRY(); { pax::CPaxDmlStateLocal::Instance()->FinishDmlState(rel, operation); } CBDB_CATCH_DEFAULT(); @@ -453,6 +449,7 @@ uint32 PaxAccessMethod::ScanFlags(Relation relation) { flags |= SCAN_FORCE_BIG_WRITE_LOCK; #endif + flags |= SCAN_SUPPORT_RUNTIME_FILTER; return flags; } @@ -790,6 +787,8 @@ static const TableAmRoutine kPaxColumnMethods = { .scan_sample_next_block = pax::CCPaxAccessMethod::ScanSampleNextBlock, .scan_sample_next_tuple = pax::CCPaxAccessMethod::ScanSampleNextTuple, + .dml_init = pax::CCPaxAccessMethod::ExtDmlInit, + .dml_fini = pax::CCPaxAccessMethod::ExtDmlFini, .amoptions = paxc::PaxAccessMethod::AmOptions, .swap_relation_files = paxc::PaxAccessMethod::SwapRelationFiles, .validate_column_encoding_clauses = @@ -1190,9 +1189,6 @@ void _PG_init(void) { // NOLINT prev_object_access_hook = object_access_hook; object_access_hook = PaxObjectAccessHook; - ext_dml_init_hook = pax::CCPaxAccessMethod::ExtDmlInit; - ext_dml_finish_hook = pax::CCPaxAccessMethod::ExtDmlFini; - prev_ProcessUtilit_hook = ProcessUtility_hook; ProcessUtility_hook = paxProcessUtility; diff --git a/contrib/pax_storage/src/cpp/access/pax_access_handle.h b/contrib/pax_storage/src/cpp/access/pax_access_handle.h index d541a400d2b..9ed827b4709 100644 --- a/contrib/pax_storage/src/cpp/access/pax_access_handle.h +++ b/contrib/pax_storage/src/cpp/access/pax_access_handle.h @@ -197,6 +197,3 @@ class CCPaxAccessMethod final { }; } // namespace pax - -extern ext_dml_func_hook_type ext_dml_init_hook; -extern ext_dml_func_hook_type ext_dml_finish_hook; diff --git 
a/contrib/pax_storage/src/cpp/access/pax_dml_state.cc b/contrib/pax_storage/src/cpp/access/pax_dml_state.cc index 0a7d367d34f..40cb6bdcdcd 100644 --- a/contrib/pax_storage/src/cpp/access/pax_dml_state.cc +++ b/contrib/pax_storage/src/cpp/access/pax_dml_state.cc @@ -104,7 +104,7 @@ void CPaxDmlStateLocal::Reset() { cbdb::pax_memory_context = nullptr; } CPaxDmlStateLocal::CPaxDmlStateLocal() : last_oid_(InvalidOid), cb_{.func = DmlStateResetCallback, .arg = NULL} {} -std::shared_ptr +pg_attribute_always_inline std::shared_ptr CPaxDmlStateLocal::RemoveDmlState(const Oid &oid) { std::shared_ptr value; @@ -121,7 +121,7 @@ CPaxDmlStateLocal::RemoveDmlState(const Oid &oid) { return value; } -std::shared_ptr +pg_attribute_always_inline std::shared_ptr CPaxDmlStateLocal::FindDmlState(const Oid &oid) { Assert(OidIsValid(oid)); diff --git a/contrib/pax_storage/src/cpp/access/pax_scanner.cc b/contrib/pax_storage/src/cpp/access/pax_scanner.cc index a5e0b632002..5a354e6fa0c 100644 --- a/contrib/pax_storage/src/cpp/access/pax_scanner.cc +++ b/contrib/pax_storage/src/cpp/access/pax_scanner.cc @@ -218,7 +218,7 @@ bool PaxScanDesc::BitmapNextTuple(struct TBMIterateResult *tbmres, } TableScanDesc PaxScanDesc::BeginScan(Relation relation, Snapshot snapshot, - int nkeys, struct ScanKeyData * /*key*/, + int nkeys, struct ScanKeyData *key, ParallelTableScanDesc pscan, uint32 flags, std::shared_ptr &&pax_filter, bool build_bitmap) { @@ -326,8 +326,8 @@ void PaxScanDesc::EndScan() { } TableScanDesc PaxScanDesc::BeginScanExtractColumns( - Relation rel, Snapshot snapshot, int /*nkeys*/, - struct ScanKeyData * /*key*/, ParallelTableScanDesc parallel_scan, + Relation rel, Snapshot snapshot, int nkeys, + struct ScanKeyData *key, ParallelTableScanDesc parallel_scan, struct PlanState *ps, uint32 flags) { std::shared_ptr filter; List *targetlist = ps->plan->targetlist; @@ -361,7 +361,7 @@ TableScanDesc PaxScanDesc::BeginScanExtractColumns( filter->SetColumnProjection(std::move(col_bits)); if (pax_enable_sparse_filter) { - filter->InitSparseFilter(rel, qual); + filter->InitSparseFilter(rel, qual, key, nkeys); // FIXME: enable predicate pushdown can filter rows immediately without // assigning all columns. 
But it may mess the filter orders for multiple @@ -375,7 +375,7 @@ TableScanDesc PaxScanDesc::BeginScanExtractColumns( filter->InitRowFilter(rel, ps, filter->GetColumnProjection()); } } - return BeginScan(rel, snapshot, 0, nullptr, parallel_scan, flags, + return BeginScan(rel, snapshot, nkeys, key, parallel_scan, flags, std::move(filter), build_bitmap); } diff --git a/contrib/pax_storage/src/cpp/access/pax_visimap.cc b/contrib/pax_storage/src/cpp/access/pax_visimap.cc index b96752d31ba..cdc7dfe763e 100644 --- a/contrib/pax_storage/src/cpp/access/pax_visimap.cc +++ b/contrib/pax_storage/src/cpp/access/pax_visimap.cc @@ -143,8 +143,7 @@ bool TestVisimap(Relation rel, const char *visimap_name, int offset) { fs = Singleton::GetInstance(); auto visimap = LoadVisimap(fs, options, file_path); - auto bm = Bitmap8(BitmapRaw(visimap->data(), visimap->size()), - Bitmap8::ReadOnlyOwnBitmap); + auto bm = Bitmap8(BitmapRaw(visimap->data(), visimap->size())); auto is_set = bm.Test(offset); return !is_set; } diff --git a/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc b/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc index 647cb5743cf..5de1b14cd97 100644 --- a/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc +++ b/contrib/pax_storage/src/cpp/access/paxc_rel_options.cc @@ -50,6 +50,10 @@ static const relopt_compress_type_mapping kSelfRelCompressMap[] = { pax::ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_ZSTD}, {ColumnEncoding_Kind_COMPRESS_ZLIB_STR, pax::ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_ZLIB}, +#ifdef USE_LZ4 + {ColumnEncoding_Kind_COMPRESS_LZ4_STR, + pax::ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_LZ4}, +#endif }; typedef struct { diff --git a/contrib/pax_storage/src/cpp/access/paxc_rel_options.h b/contrib/pax_storage/src/cpp/access/paxc_rel_options.h index 4e813f38c40..e6c29363ab1 100644 --- a/contrib/pax_storage/src/cpp/access/paxc_rel_options.h +++ b/contrib/pax_storage/src/cpp/access/paxc_rel_options.h @@ -41,6 +41,7 @@ namespace paxc { #define ColumnEncoding_Kind_DICTIONARY_STR "dict" #define ColumnEncoding_Kind_COMPRESS_ZSTD_STR "zstd" #define ColumnEncoding_Kind_COMPRESS_ZLIB_STR "zlib" +#define ColumnEncoding_Kind_COMPRESS_LZ4_STR "lz4" #define STORAGE_FORMAT_TYPE_PORC "porc" #define STORAGE_FORMAT_TYPE_PORC_VEC "porc_vec" diff --git a/contrib/pax_storage/src/cpp/catalog/manifest_api.h b/contrib/pax_storage/src/cpp/catalog/manifest_api.h index f9e8f7c9eb1..3978caf9097 100644 --- a/contrib/pax_storage/src/cpp/catalog/manifest_api.h +++ b/contrib/pax_storage/src/cpp/catalog/manifest_api.h @@ -28,7 +28,7 @@ #pragma once /* - * abstract interface of hashdata manifest + * abstract interface of manifest */ #ifdef __cplusplus @@ -42,7 +42,7 @@ extern "C" { * memory management. 
*/ -/* hashdata manifest support following data types for fields */ +/* manifest support following data types for fields */ typedef enum MetaFieldType { Meta_Field_Type_Int = 1, diff --git a/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc b/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc index baca9efe47d..e655f469bc3 100644 --- a/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc +++ b/contrib/pax_storage/src/cpp/catalog/pax_aux_table.cc @@ -700,7 +700,7 @@ pax::MicroPartitionMetadata PaxGetMicroPartitionMetadata(Relation rel, paxc::FetchMicroPartitionAuxRow(rel, snapshot, block_id, FetchMicroPartitionAuxRowCallbackWrapper, &ctx); - return std::move(ctx.info); + return ctx.info; } CBDB_WRAP_END; } diff --git a/contrib/pax_storage/src/cpp/clustering/pax_clustering_reader.cc b/contrib/pax_storage/src/cpp/clustering/pax_clustering_reader.cc index 536743fd722..eb74f515d85 100644 --- a/contrib/pax_storage/src/cpp/clustering/pax_clustering_reader.cc +++ b/contrib/pax_storage/src/cpp/clustering/pax_clustering_reader.cc @@ -63,8 +63,8 @@ bool PaxClusteringReader::GetNextTuple(TupleTableSlot *slot) { file->Close(); } - std::shared_ptr file; - std::shared_ptr toast_file; + std::unique_ptr file; + std::unique_ptr toast_file; file = file_system_->Open(meta_info.GetFileName(), pax::fs::kReadMode); @@ -75,7 +75,7 @@ bool PaxClusteringReader::GetNextTuple(TupleTableSlot *slot) { } reader_ = MicroPartitionFileFactory::CreateMicroPartitionReader( - options, ReaderFlags::FLAGS_EMPTY, file, toast_file); + options, ReaderFlags::FLAGS_EMPTY, std::move(file), std::move(toast_file)); } else { return false; } diff --git a/contrib/pax_storage/src/cpp/cmake/pax.cmake b/contrib/pax_storage/src/cpp/cmake/pax.cmake index 71775bac2dd..528b4e8cafc 100644 --- a/contrib/pax_storage/src/cpp/cmake/pax.cmake +++ b/contrib/pax_storage/src/cpp/cmake/pax.cmake @@ -30,6 +30,7 @@ set(pax_comm_src comm/bitmap.cc comm/bloomfilter.cc comm/byte_buffer.cc + comm/fast_io.cc comm/guc.cc comm/paxc_wrappers.cc comm/pax_memory.cc @@ -51,6 +52,7 @@ set(pax_storage_src storage/columns/pax_dict_encoding.cc storage/columns/pax_decoding.cc storage/columns/pax_encoding.cc + storage/columns/pax_delta_encoding.cc storage/columns/pax_rlev2_decoding.cc storage/columns/pax_rlev2_encoding.cc storage/columns/pax_vec_bitpacked_column.cc @@ -172,7 +174,7 @@ add_subdirectory(contrib/tabulate) set(pax_target_src ${PROTO_SRCS} ${pax_storage_src} ${pax_clustering_src} ${pax_exceptions_src} ${pax_access_src} ${pax_comm_src} ${pax_catalog_src} ${pax_vec_src}) set(pax_target_include ${pax_target_include} ${ZTSD_HEADER} ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR} contrib/tabulate/include) -set(pax_target_link_libs ${pax_target_link_libs} protobuf zstd z postgres) +set(pax_target_link_libs ${pax_target_link_libs} protobuf zstd z postgres uring) if (PAX_USE_LZ4) list(APPEND pax_target_link_libs lz4) endif() @@ -206,7 +208,7 @@ endif(VEC_BUILD) target_include_directories(pax PUBLIC ${pax_target_include}) target_link_directories(pax PUBLIC ${pax_target_link_directories}) -target_link_libraries(pax PUBLIC ${pax_target_link_libs}) +target_link_libraries(pax PRIVATE ${pax_target_link_libs}) set_target_properties(pax PROPERTIES BUILD_RPATH_USE_ORIGIN ON BUILD_WITH_INSTALL_RPATH ON @@ -232,8 +234,8 @@ if (BUILD_GTEST) add_dependencies(test_main ${pax_target_dependencies} gtest gmock) target_include_directories(test_main PUBLIC ${pax_target_include} ${CMAKE_CURRENT_SOURCE_DIR} ${gtest_SOURCE_DIR}/include contrib/cpp-stub/src/ contrib/cpp-stub/src_linux/) 
- target_link_directories(test_main PUBLIC ${pax_target_link_directories}) - target_link_libraries(test_main PUBLIC ${pax_target_link_libs} gtest gmock postgres) + target_link_directories(test_main PRIVATE ${pax_target_link_directories}) + target_link_libraries(test_main PRIVATE ${pax_target_link_libs} gtest gmock postgres) endif(BUILD_GTEST) if(BUILD_GBENCH) diff --git a/contrib/pax_storage/src/cpp/cmake/pax_format.cmake b/contrib/pax_storage/src/cpp/cmake/pax_format.cmake index 4bdc25671f9..8d28e793d27 100644 --- a/contrib/pax_storage/src/cpp/cmake/pax_format.cmake +++ b/contrib/pax_storage/src/cpp/cmake/pax_format.cmake @@ -20,6 +20,7 @@ set(pax_comm_src comm/bitmap.cc comm/bloomfilter.cc comm/byte_buffer.cc + comm/fast_io.cc comm/guc.cc comm/paxc_wrappers.cc comm/pax_memory.cc @@ -41,6 +42,7 @@ set(pax_storage_src storage/columns/pax_dict_encoding.cc storage/columns/pax_decoding.cc storage/columns/pax_encoding.cc + storage/columns/pax_delta_encoding.cc storage/columns/pax_rlev2_decoding.cc storage/columns/pax_rlev2_encoding.cc storage/columns/pax_vec_column.cc @@ -107,7 +109,7 @@ set(pax_vec_src ${pax_vec_src} endif() set(pax_target_include ${ZTSD_HEADER} ${CMAKE_CURRENT_SOURCE_DIR} ${CBDB_INCLUDE_DIR} contrib/tabulate/include) -set(pax_target_link_libs uuid protobuf zstd z) +set(pax_target_link_libs uuid protobuf zstd z uring) if (PAX_USE_LZ4) list(APPEND pax_target_link_libs lz4) endif() @@ -134,7 +136,7 @@ endif(VEC_BUILD) add_library(paxformat SHARED ${PROTO_SRCS} ${pax_storage_src} ${pax_clustering_src} ${pax_exceptions_src} ${pax_comm_src} ${pax_vec_src}) target_include_directories(paxformat PUBLIC ${pax_target_include}) target_link_directories(paxformat PUBLIC ${pax_target_link_directories}) -target_link_libraries(paxformat PUBLIC ${pax_target_link_libs}) +target_link_libraries(paxformat PRIVATE ${pax_target_link_libs}) set_target_properties(paxformat PROPERTIES OUTPUT_NAME paxformat) @@ -195,4 +197,4 @@ install(TARGETS paxformat add_executable(paxformat_test paxformat_test.cc) target_include_directories(paxformat_test PUBLIC ${pax_target_include} ${CMAKE_CURRENT_SOURCE_DIR}) add_dependencies(paxformat_test paxformat) -target_link_libraries(paxformat_test PUBLIC paxformat postgres) +target_link_libraries(paxformat_test PRIVATE paxformat postgres) diff --git a/contrib/pax_storage/src/cpp/comm/bitmap.h b/contrib/pax_storage/src/cpp/comm/bitmap.h index 1fc831f1787..4e811dddc8e 100644 --- a/contrib/pax_storage/src/cpp/comm/bitmap.h +++ b/contrib/pax_storage/src/cpp/comm/bitmap.h @@ -134,12 +134,28 @@ struct BitmapRaw final { static_assert(BM_WORD_BITS == (1 << BM_WORD_SHIFTS)); return (index >> BM_WORD_SHIFTS) < size; } - inline bool Empty() const { + + inline bool Empty(uint32 end_index) const { if (!bitmap) return true; - for (size_t i = 0; i < size; i++) - if (bitmap[i]) return false; + + uint32 end_word = BM_INDEX_WORD_OFF(end_index); + uint32 end_bit_offset = BM_INDEX_BIT_OFF(end_index); + + for (uint32 i = 0; i < end_word && i < size; i++) { + if (bitmap[i] != 0) return false; + } + + // Check partial word at end + if (end_word < size && end_bit_offset > 0) { + T mask = (T(1) << end_bit_offset) - 1; + if (bitmap[end_word] & mask) return false; + } + return true; } + + inline bool Empty() const { return Empty(size * sizeof(T) * 8ULL); } + BitmapRaw() = default; BitmapRaw(T *buffer, size_t size) : bitmap(buffer), size(size) {} BitmapRaw(const BitmapRaw &) = delete; @@ -147,19 +163,9 @@ struct BitmapRaw final { raw.bitmap = nullptr; raw.size = 0; } - BitmapRaw 
&operator=(BitmapRaw) = delete; BitmapRaw &operator=(BitmapRaw &) = delete; BitmapRaw &operator=(const BitmapRaw &) = delete; - BitmapRaw &operator=(BitmapRaw &&raw) { - if (this != &raw) { - PAX_DELETE_ARRAY(bitmap); - bitmap = raw.bitmap; - size = raw.size; - raw.bitmap = nullptr; - raw.size = 0; - } - return *this; - } + BitmapRaw &operator=(BitmapRaw &&raw) = delete; ~BitmapRaw() = default; @@ -170,50 +176,57 @@ struct BitmapRaw final { template class BitmapTpl final { public: - using BitmapMemoryPolicy = void (*)(BitmapRaw &, uint32); - explicit BitmapTpl(uint32 initial_size = 16, - BitmapMemoryPolicy policy = DefaultBitmapMemoryPolicy) { + using BitmapMemoryPolicy = void (*)(BitmapRaw &, uint32, uint8); + explicit BitmapTpl(uint32 initial_size = 16, uint8 init_value = 0) { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); static_assert(BM_WORD_BITS == (1 << BM_WORD_SHIFTS)); - policy_ = policy; - policy(raw_, Max(initial_size, 16)); + policy_ = DefaultBitmapMemoryPolicy; + policy_(raw_, Max(initial_size, 16), init_value); + init_value_ = init_value; } - explicit BitmapTpl(const BitmapRaw &raw, BitmapMemoryPolicy policy) { + explicit BitmapTpl(const BitmapRaw &raw) { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); static_assert(BM_WORD_BITS == (1 << BM_WORD_SHIFTS)); - Assert(policy == ReadOnlyRefBitmap || policy == ReadOnlyOwnBitmap); - policy_ = policy; + policy_ = ReadOnlyRefBitmap; raw_.bitmap = raw.bitmap; raw_.size = raw.size; } BitmapTpl(const BitmapTpl &tpl) = delete; - BitmapTpl(BitmapTpl &&tpl) - : raw_(std::move(tpl.raw_)), policy_(tpl.policy_) {} + BitmapTpl(BitmapTpl &&tpl) : raw_(std::move(tpl.raw_)), policy_(tpl.policy_) { + tpl.raw_.bitmap = nullptr; + tpl.policy_ = ReadOnlyRefBitmap; + } BitmapTpl(BitmapRaw &&raw) - : raw_(std::move(raw)), policy_(DefaultBitmapMemoryPolicy) {} + : raw_(std::move(raw)), policy_(ReadOnlyRefBitmap) {} BitmapTpl &operator=(const BitmapTpl &tpl) = delete; BitmapTpl &operator=(BitmapTpl &&tpl) = delete; ~BitmapTpl() { // Reference doesn't free the memory - if (policy_ == ReadOnlyRefBitmap) raw_.bitmap = nullptr; + if (policy_ == DefaultBitmapMemoryPolicy) PAX_DELETE_ARRAY(raw_.bitmap); + raw_.bitmap = nullptr; } std::unique_ptr Clone() const { + auto bm = std::make_unique(raw_); auto p = PAX_NEW_ARRAY(raw_.size); memcpy(p, raw_.bitmap, sizeof(T) * raw_.size); - BitmapRaw bm_raw(p, raw_.size); - return std::make_unique(std::move(bm_raw)); + bm->raw_.bitmap = p; + bm->raw_.size = raw_.size; + bm->policy_ = DefaultBitmapMemoryPolicy; + return bm; } inline size_t WordBits() const { return BM_WORD_BITS; } inline void Set(uint32 index) { - if (unlikely(!raw_.HasEnoughSpace(index))) policy_(raw_, index); + if (unlikely(!raw_.HasEnoughSpace(index))) + policy_(raw_, index, init_value_); raw_.Set(index); } inline void SetN(uint32 index) { - if (unlikely(!raw_.HasEnoughSpace(index))) policy_(raw_, index); + if (unlikely(!raw_.HasEnoughSpace(index))) + policy_(raw_, index, init_value_); raw_.SetN(index); } inline void Clear(uint32 index) { @@ -232,7 +245,8 @@ class BitmapTpl final { } // invert the bit and return the old value. 
inline bool Toggle(uint32 index) { - if (unlikely(!raw_.HasEnoughSpace(index))) policy_(raw_, index); + if (unlikely(!raw_.HasEnoughSpace(index))) + policy_(raw_, index, init_value_); return raw_.Toggle(index); } // count bits in range [0, index] @@ -252,29 +266,31 @@ class BitmapTpl final { inline bool Empty() const { return raw_.Empty(); } + // check if the bitmap is empty in the range [0, end_index) + inline bool Empty(uint32 end_index) const { return raw_.Empty(end_index); } + BitmapMemoryPolicy Policy() const { return policy_; } const BitmapRaw &Raw() const { return raw_; } BitmapRaw &Raw() { return raw_; } - static void DefaultBitmapMemoryPolicy(BitmapRaw &raw, uint32 index) { + static void DefaultBitmapMemoryPolicy(BitmapRaw &raw, uint32 index, + uint8 init_value = 0) { auto old_bitmap = raw.bitmap; auto old_size = raw.size; auto size = Max(BM_INDEX_WORD_OFF(index) + 1, old_size * 2); auto p = PAX_NEW_ARRAY(size); if (old_size > 0) memcpy(p, old_bitmap, sizeof(T) * old_size); - memset(&p[old_size], 0, sizeof(T) * (size - old_size)); + memset(&p[old_size], init_value, sizeof(T) * (size - old_size)); raw.bitmap = p; raw.size = size; PAX_DELETE_ARRAY(old_bitmap); } - static void ReadOnlyRefBitmap(BitmapRaw & /*raw*/, uint32 /*index*/) { + static void ReadOnlyRefBitmap(BitmapRaw & /*raw*/, uint32 /*index*/, + uint8 /*init_value*/) { // raise CBDB_RAISE(cbdb::CException::kExTypeInvalidMemoryOperation); } - static void ReadOnlyOwnBitmap(BitmapRaw & /*raw*/, uint32 /*index*/) { - CBDB_RAISE(cbdb::CException::kExTypeInvalidMemoryOperation); - } static inline size_t RequireWords(size_t nbits) { return nbits ? ((nbits - 1) >> BM_WORD_SHIFTS) + 1 : 0; @@ -287,12 +303,14 @@ class BitmapTpl final { return nwords * sizeof(T); } - static std::unique_ptr> BitmapTplCopy(const BitmapTpl *bitmap) { + static std::unique_ptr> BitmapTplCopy( + const BitmapTpl *bitmap) { if (bitmap == nullptr) return nullptr; return bitmap->Clone(); } - static std::unique_ptr> Union(const BitmapTpl *a, const BitmapTpl *b) { + static std::unique_ptr> Union(const BitmapTpl *a, + const BitmapTpl *b) { std::unique_ptr> result; const BitmapTpl *large; const BitmapTpl *small; @@ -322,6 +340,7 @@ class BitmapTpl final { BitmapRaw raw_; BitmapMemoryPolicy policy_; + uint8 init_value_ = 0; }; using Bitmap8 = BitmapTpl; diff --git a/contrib/pax_storage/src/cpp/comm/bitmap_test.cc b/contrib/pax_storage/src/cpp/comm/bitmap_test.cc index 937a9c08d8b..dd2cc8094e5 100644 --- a/contrib/pax_storage/src/cpp/comm/bitmap_test.cc +++ b/contrib/pax_storage/src/cpp/comm/bitmap_test.cc @@ -36,7 +36,7 @@ TEST_F(BitMapTest, Bitmap8) { Bitmap8 bm(20); ASSERT_TRUE(bm.Empty()); - for (auto i = 0; i <= 128; i++) { + for (uint32 i = 0; i <= 128; i++) { ASSERT_FALSE(bm.Test(i)); // zeros ASSERT_FALSE(bm.Toggle(i)); ASSERT_TRUE(bm.Test(i)); @@ -61,14 +61,14 @@ TEST_F(BitMapTest, Bitmap8) { TEST_F(BitMapTest, Bitmap8SetN) { Bitmap8 bm(10); - const auto nbits = 128; + const uint32 nbits = 128; ASSERT_TRUE(bm.Empty()); - for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); auto fn = [&bm, nbits](uint32 index) { bm.ClearAll(); - for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); bm.SetN(index); for (uint32 i = 0; i <= index; i++) ASSERT_TRUE(bm.Test(i)); for (uint32 i = index + 1; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); @@ -78,13 +78,13 @@ TEST_F(BitMapTest, Bitmap8SetN) { TEST_F(BitMapTest, Bitmap8ClearN) { 
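  // Illustrative sketch (assuming the Bitmap8 API introduced in
  // comm/bitmap.h above): the reworked BitmapTpl grows through a memory
  // policy that pre-fills newly allocated words with init_value, and
  // Empty(end_index) only inspects bits in [0, end_index).
  {
    Bitmap8 sketch(/*initial_size=*/16, /*init_value=*/0x00);
    sketch.Set(10);                  // bit 10 lands in the second word
    ASSERT_TRUE(sketch.Empty(8));    // bits [0, 8) are still clear
    ASSERT_FALSE(sketch.Empty(16));  // bit 10 falls inside [0, 16)
    ASSERT_FALSE(sketch.Empty());    // the whole-bitmap overload still works
  }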
Bitmap8 bm(10); - const auto nbits = 128; + const uint32 nbits = 128; ASSERT_TRUE(bm.Empty()); - for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); auto fn = [&bm, nbits](uint32 index) { - for (auto i = 0; i <= nbits; i++) { + for (uint32 i = 0; i <= nbits; i++) { bm.Set(i); ASSERT_TRUE(bm.Test(i)); } @@ -99,7 +99,7 @@ TEST_F(BitMapTest, Bitmap64) { Bitmap64 bm(100); ASSERT_TRUE(bm.Empty()); - for (auto i = 0; i <= 128; i++) { + for (uint32 i = 0; i <= 128; i++) { ASSERT_FALSE(bm.Test(i)); // zeros ASSERT_FALSE(bm.Toggle(i)); ASSERT_TRUE(bm.Test(i)); @@ -122,14 +122,14 @@ TEST_F(BitMapTest, Bitmap64) { } TEST_F(BitMapTest, Bitmap64SetN) { Bitmap64 bm(1); - const auto nbits = 512; + const uint32 nbits = 512; ASSERT_TRUE(bm.Empty()); - for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); auto fn = [&bm, nbits](uint32 index) { bm.ClearAll(); - for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); bm.SetN(index); for (uint32 i = 0; i <= index; i++) ASSERT_TRUE(bm.Test(i)); for (uint32 i = index + 1; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); @@ -139,13 +139,13 @@ TEST_F(BitMapTest, Bitmap64SetN) { TEST_F(BitMapTest, Bitmap64ClearN) { Bitmap64 bm(1); - const auto nbits = 512; + const uint32 nbits = 512; ASSERT_TRUE(bm.Empty()); - for (auto i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); + for (uint32 i = 0; i <= nbits; i++) ASSERT_FALSE(bm.Test(i)); auto fn = [&bm, &nbits](uint32 index) { - for (auto i = 0; i <= nbits; i++) { + for (uint32 i = 0; i <= nbits; i++) { bm.Set(i); ASSERT_TRUE(bm.Test(i)); } diff --git a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc index b9d1a723028..3e54e965698 100644 --- a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc +++ b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.cc @@ -124,12 +124,6 @@ void cbdb::MemoryCtxRegisterResetCallback(MemoryContext context, CBDB_WRAP_END; } -Oid cbdb::RelationGetRelationId(Relation rel) { - CBDB_WRAP_START; - { return RelationGetRelid(rel); } - CBDB_WRAP_END; -} - #ifdef RUN_GTEST Datum cbdb::DatumFromCString(const char *src, size_t length) { CBDB_WRAP_START; diff --git a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h index 05738a4b2ab..2031662357d 100644 --- a/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h +++ b/contrib/pax_storage/src/cpp/comm/cbdb_wrappers.h @@ -114,8 +114,6 @@ void MemoryCtxDelete(MemoryContext memory_context); void MemoryCtxRegisterResetCallback(MemoryContext context, MemoryContextCallback *cb); -Oid RelationGetRelationId(Relation rel); - static inline void *DatumToPointer(Datum d) noexcept { return DatumGetPointer(d); } @@ -164,6 +162,10 @@ static inline float8 DatumToFloat8(Datum d) noexcept { return DatumGetFloat8(d); } +static pg_attribute_always_inline Oid RelationGetRelationId(Relation rel) noexcept { + return RelationGetRelid(rel); +} + BpChar *BpcharInput(const char *s, size_t len, int32 atttypmod); VarChar *VarcharInput(const char *s, size_t len, int32 atttypmod); text *CstringToText(const char *s, size_t len); diff --git a/contrib/pax_storage/src/cpp/comm/common_io.h b/contrib/pax_storage/src/cpp/comm/common_io.h new file mode 100644 index 00000000000..44730376054 --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/common_io.h @@ -0,0 +1,39 @@ 
+/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * common_io.h + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/comm/common_io.h + * + *------------------------------------------------------------------------- + */ + +#pragma once +#include +#include + +namespace pax +{ +struct IORequest { + void* buffer; + size_t size; + off_t offset; +}; +} // namespace pax diff --git a/contrib/pax_storage/src/cpp/comm/fast_io.cc b/contrib/pax_storage/src/cpp/comm/fast_io.cc new file mode 100644 index 00000000000..7ed96d7a377 --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/fast_io.cc @@ -0,0 +1,145 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * fast_io.cc + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/comm/fast_io.cc + * + *------------------------------------------------------------------------- + */ + +#include "fast_io.h" + +#include // for pread + +// uring_likely may not be defined in older liburing versions +#ifndef uring_likely +#if __GNUC__ >= 3 +#define uring_likely(x) __builtin_expect((x) != 0, 1) +#else +#define uring_likely(x) ((x) != 0) +#endif +#endif + +namespace pax +{ + +bool IOUringFastIO::available() { + static char support_io_uring = 0; + + if (support_io_uring == 1) return true; + if (support_io_uring == -1) return false; + + struct io_uring ring; + bool supported = (io_uring_queue_init(128, &ring, 0) == 0); + if (supported) { + io_uring_queue_exit(&ring); + } + support_io_uring = supported ? 
1 : -1; + return supported; +} + +// if pair.first == 0, all read requests are successful +// pair.second indicates the number of successful read requests +std::pair IOUringFastIO::read(int fd, std::vector &request, std::vector &result) { + size_t index = 0; + int success_read = 0; + int retcode = 0; + size_t completed = 0; + size_t total_requests = request.size(); + + // Implementation for synchronous read using io_uring + if (uring_likely(request.empty())) return {0, 0}; + if (status_ != 'i') return {-EINVAL, 0}; + + result.resize(request.size(), false); + + while (completed < total_requests) { + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + unsigned head; + unsigned count; + int rc; + // Submit read requests + while (index < total_requests) { + sqe = io_uring_get_sqe(&ring_); + if (!sqe) break; // No more SQEs available, retry later + + io_uring_prep_read(sqe, fd, request[index].buffer, request[index].size, request[index].offset); + io_uring_sqe_set_data(sqe, (void*)(uintptr_t)index); + index++; + } + + // submit and wait for completions + do { + rc = io_uring_submit_and_wait(&ring_, 1); + } while (rc == -EINTR); + if (rc < 0) return {rc, success_read}; + + count = 0; + io_uring_for_each_cqe(&ring_, head, cqe) { + size_t req_index = (size_t)(uintptr_t)io_uring_cqe_get_data(cqe); + if (cqe->res >= 0) { + // Successful read + result[req_index] = true; + success_read++; + } else if (retcode == 0) { + retcode = cqe->res; // capture the first error + } + completed++; + count++; + } + io_uring_cq_advance(&ring_, count); + } + return {retcode, success_read}; // Placeholder +} + +std::pair SyncFastIO::read(int fd, std::vector &request, std::vector &result) { + size_t total_requests = request.size(); + if (total_requests == 0) return {0, 0}; + + result.resize(total_requests, false); + + int success_read = 0; + int retcode = 0; + + for (size_t i = 0; i < total_requests; ++i) { + ssize_t bytes_read = 0; + ssize_t nbytes; + auto &req = request[i]; + do { + nbytes = pread(fd, (char *)req.buffer + bytes_read, req.size - bytes_read, req.offset + bytes_read); + if (nbytes > 0) bytes_read += nbytes; + } while ((nbytes == -1 && errno == EINTR) || (nbytes > 0 && static_cast(bytes_read) < req.size)); + + if (bytes_read < 0) { + if (retcode == 0) { + retcode = static_cast(bytes_read); // capture first error + } + } else if (static_cast(bytes_read) == request[i].size) { + result[i] = true; + success_read++; + } + } + + return {retcode, success_read}; +} + +} // namespace pax \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/comm/fast_io.h b/contrib/pax_storage/src/cpp/comm/fast_io.h new file mode 100644 index 00000000000..da63b4d89ea --- /dev/null +++ b/contrib/pax_storage/src/cpp/comm/fast_io.h @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * fast_io.h + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/comm/fast_io.h + * + *------------------------------------------------------------------------- + */ + +#pragma once + +#include "comm/common_io.h" + +#include +#include +#include +#include +#include + +namespace pax +{ + +template +int fast_io_read(int fd, std::vector &request) { + T io_handler(request.size()); + return io_handler.read(fd, request).first; +} + +template +std::pair fast_io_read2(int fd, std::vector &request) { + T io_handler(request.size()); + return io_handler.read(fd, request); +} + +class SyncFastIO { +public: + SyncFastIO(size_t dummy_queue_size = 0) {} + std::pair read(int fd, std::vector &request, std::vector &result); +}; + +// io_uring-based FastIO +class IOUringFastIO { +public: + IOUringFastIO(size_t queue_size = 128) { + int ret = io_uring_queue_init(std::max(queue_size, static_cast(128)), &ring_, 0); + + // ret < 0: unsupported + // otherwise initialized + status_ = ret < 0 ? 'x' : 'i'; + } + + ~IOUringFastIO() { + if (status_ == 'i') + io_uring_queue_exit(&ring_); + } + + static bool available(); + + // if pair.first == 0, all read requests are successful + // pair.second indicates the number of successful read requests + std::pair read(int fd, std::vector &request, std::vector &result); + +private: + struct io_uring ring_; + + // 'u' for uninitialized, 'i' for initialized, 'x' for unsupported + char status_ = 'u'; +}; + +} // namespace pax \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/comm/guc.cc b/contrib/pax_storage/src/cpp/comm/guc.cc index 6cdc429a3eb..8a6497d1db3 100644 --- a/contrib/pax_storage/src/cpp/comm/guc.cc +++ b/contrib/pax_storage/src/cpp/comm/guc.cc @@ -81,8 +81,8 @@ static bool CheckTuplePerGroup(int *newval, void **extra, GucSource source) { bool ok = *newval <= pax::pax_max_tuples_per_file; if (!ok) { elog(WARNING, - "The guc pax_max_tuples_per_group should LE with " - "pax_max_tuples_per_file"); + "The guc pax.max_tuples_per_group should LE with " + "pax.max_tuples_per_file"); } return ok; } @@ -91,8 +91,8 @@ static bool CheckTuplePerFile(int *newval, void **extra, GucSource source) { bool ok = *newval >= pax::pax_max_tuples_per_group; if (!ok) { elog(WARNING, - "The guc pax_max_tuples_per_file should BE with " - "pax_max_tuples_per_group"); + "The guc pax.max_tuples_per_file should BE with " + "pax.max_tuples_per_group"); } return ok; } @@ -102,8 +102,8 @@ static bool CheckMinCompressToastSize(int *newval, void **extra, bool ok = *newval < pax::pax_min_size_of_external_toast; if (!ok) { elog(WARNING, - "The guc pax_min_size_of_compress_toast should LT with " - "pax_min_size_of_external_toast"); + "The guc pax.min_size_of_compress_toast should LT with " + "pax.min_size_of_external_toast"); } return ok; } @@ -113,8 +113,8 @@ static bool CheckMinExternalToastSize(int *newval, void **extra, bool ok = *newval > pax::pax_min_size_of_compress_toast; if (!ok) { elog(WARNING, - "The guc pax_min_size_of_external_toast should BT with " - "pax_min_size_of_compress_toast"); + "The guc pax.min_size_of_external_toast should BT with " + 
"pax.min_size_of_compress_toast"); } return ok; } @@ -126,71 +126,71 @@ static bool CheckDefaultStorageFormat(char **newval, void **extra, } void DefineGUCs() { - DefineCustomBoolVariable("pax_enable_debug", "enable pax debug", NULL, + DefineCustomBoolVariable("pax.enable_debug", "enable pax debug", NULL, &pax::pax_enable_debug, false, PGC_USERSET, GUC_GPDB_NEED_SYNC, NULL, NULL, NULL); - DefineCustomBoolVariable("pax_enable_sparse_filter", + DefineCustomBoolVariable("pax.enable_sparse_filter", "enable pax filter, contains min/max and bloom " "filters for sparse filtering", NULL, &pax::pax_enable_sparse_filter, true, PGC_USERSET, 0, NULL, NULL, NULL); - DefineCustomBoolVariable("pax_enable_row_filter", "enable pax row filter", + DefineCustomBoolVariable("pax.enable_row_filter", "enable pax row filter", NULL, &pax::pax_enable_row_filter, false, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomIntVariable( - "pax_scan_reuse_buffer_size", "set the reuse buffer size", NULL, + "pax.scan_reuse_buffer_size", "set the reuse buffer size", NULL, &pax::pax_scan_reuse_buffer_size, PAX_SCAN_REUSE_BUFFER_DEFAULT_SIZE, PAX_SCAN_REUSE_BUFFER_MIN_SIZE, PAX_SCAN_REUSE_BUFFER_MAX_SIZE, PGC_USERSET, GUC_GPDB_NEED_SYNC, NULL, NULL, NULL); DefineCustomIntVariable( - "pax_max_tuples_per_group", + "pax.max_tuples_per_group", "the default value for the limit on the number of tuples in a group", NULL, &pax::pax_max_tuples_per_group, PAX_MAX_TUPLES_PER_GROUP_DEFAULT, PAX_MAX_TUPLES_PER_GROUP_MIN, PAX_MAX_TUPLES_PER_GROUP_MAX, PGC_USERSET, 0, CheckTuplePerGroup, NULL, NULL); DefineCustomIntVariable( - "pax_max_tuples_per_file", + "pax.max_tuples_per_file", "the default value for the limit on the number of tuples in a file", NULL, &pax::pax_max_tuples_per_file, PAX_MAX_TUPLES_PER_FILE_DEFAULT, PAX_MAX_TUPLES_PER_FILE_MIN, PAX_MAX_NUM_TUPLES_PER_FILE, PGC_USERSET, 0, CheckTuplePerFile, NULL, NULL); DefineCustomIntVariable( - "pax_max_size_per_file", + "pax.max_size_per_file", "the default value for the limit on the number of tuples in a file", NULL, &pax::pax_max_size_per_file, PAX_MAX_SIZE_PER_FILE_DEFAULT, PAX_MAX_SIZE_PER_FILE_MIN, PAX_MAX_SIZE_PER_FILE_MAX, PGC_USERSET, 0, NULL, NULL, NULL); - DefineCustomBoolVariable("pax_enable_toast", "enable pax toast", NULL, + DefineCustomBoolVariable("pax.enable_toast", "enable pax toast", NULL, &pax::pax_enable_toast, true, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomIntVariable( - "pax_min_size_of_compress_toast", + "pax.min_size_of_compress_toast", "the minimum value for creating compress toast", NULL, &pax::pax_min_size_of_compress_toast, PAX_MIN_SIZE_MAKE_COMPRESSED_TOAST, PAX_MIN_SIZE_MAKE_COMPRESSED_TOAST, PAX_MAX_SIZE_MAKE_COMPRESSED_TOAST, PGC_USERSET, 0, CheckMinCompressToastSize, NULL, NULL); DefineCustomIntVariable( - "pax_min_size_of_external_toast", + "pax.min_size_of_external_toast", "the minimum value for creating external toast", NULL, &pax::pax_min_size_of_external_toast, PAX_MIN_SIZE_MAKE_EXTERNAL_TOAST, PAX_MIN_SIZE_MAKE_EXTERNAL_TOAST, INT_MAX, PGC_USERSET, 0, CheckMinExternalToastSize, NULL, NULL); DefineCustomStringVariable( - "pax_default_storage_format", "the default storage format", NULL, + "pax.default_storage_format", "the default storage format", NULL, &pax::pax_default_storage_format, STORAGE_FORMAT_TYPE_DEFAULT, PGC_USERSET, GUC_GPDB_NEED_SYNC, CheckDefaultStorageFormat, NULL, NULL); - DefineCustomIntVariable("pax_bloom_filter_work_memory_bytes", + DefineCustomIntVariable("pax.bloom_filter_work_memory_bytes", "the bloom filter work memory(only 
used on write)", NULL, &pax::pax_bloom_filter_work_memory_bytes, PAX_BLOOM_FILTER_WORK_MEMORY_BYTES, @@ -198,7 +198,7 @@ void DefineGUCs() { PAX_MAX_BLOOM_FILTER_WORK_MEMORY_BYTES, PGC_USERSET, 0, NULL, NULL, NULL); - DefineCustomBoolVariable("pax_log_filter_tree", "Log the filter tree", NULL, + DefineCustomBoolVariable("pax.log_filter_tree", "Log the filter tree", NULL, &pax::pax_log_filter_tree, false, PGC_USERSET, 0, NULL, NULL, NULL); } diff --git a/contrib/pax_storage/src/cpp/manifest/manifest.c b/contrib/pax_storage/src/cpp/manifest/manifest.c index fb246a35b0c..c45efcb17a0 100644 --- a/contrib/pax_storage/src/cpp/manifest/manifest.c +++ b/contrib/pax_storage/src/cpp/manifest/manifest.c @@ -245,8 +245,6 @@ static void manifestvalue_to_json(yyjson_mut_doc *doc, yyjson_mut_val *obj, } Datum json_to_manifestvalue(yyjson_val *type_val, MetaFieldType typ) { - Datum value; - switch (typ) { case Meta_Field_Type_Int: return Int32GetDatum(yyjson_get_int(type_val)); @@ -312,8 +310,7 @@ static const char *manifest_to_json(ManifestHeap *m, size_t *len) { static bool json_to_manifestheap(ManifestHeap *m, char *buf, int len) { size_t idx, max; - yyjson_val *datameta, *add, *remove; - const char *summary; + yyjson_val *datameta; yyjson_read_err err; yyjson_doc *doc = @@ -411,7 +408,6 @@ void serialize_manifest(Relation rel, ManifestHeap *manifest, char *manifest_pax * we need refresh the new manifest file path in the top entrance table */ void update_manifest_top_entrance(Relation rel, const char *manifest_path) { - Oid aux_oid; Relation aux_rel; TupleDesc desc; SysScanDesc scan; @@ -562,7 +558,6 @@ Oid create_manifest_auxiliary_heap(Relation parentrel) { */ void manifest_create(Relation rel, RelFileNode relnode) { char aux_relname[NAMEDATALEN]; - ReindexParams reindex_params = {0}; Relation aux_rel; Oid aux_oid; bool exists; diff --git a/contrib/pax_storage/src/cpp/manifest/tuple.c b/contrib/pax_storage/src/cpp/manifest/tuple.c index 6ac104ec1f9..b956058836e 100644 --- a/contrib/pax_storage/src/cpp/manifest/tuple.c +++ b/contrib/pax_storage/src/cpp/manifest/tuple.c @@ -284,7 +284,6 @@ void manifest_insert(ManifestRelation mrel, const MetaValue data[], MemoryContext oldctx; ManifestHeap *mheap; ManifestTuple mtuple; - Relation rel; oldctx = MemoryContextSwitchTo(mrel->mctx); diff --git a/contrib/pax_storage/src/cpp/pax_gbench.cc b/contrib/pax_storage/src/cpp/pax_gbench.cc index 82dbaaa7bb2..b6a0ecb0c76 100644 --- a/contrib/pax_storage/src/cpp/pax_gbench.cc +++ b/contrib/pax_storage/src/cpp/pax_gbench.cc @@ -25,12 +25,310 @@ *------------------------------------------------------------------------- */ +#include "pax_gbench.h" + +#include "comm/cbdb_api.h" + #include -static void example_benchmark(benchmark::State &state) { +#include +#include +#include + +#include "access/paxc_rel_options.h" +#include "comm/cbdb_wrappers.h" +#include "cpp-stub/src/stub.h" +#include "storage/micro_partition_iterator.h" +#include "storage/pax.h" +#include "storage/strategy.h" + +namespace pax::bench { + +// Create memory context for benchmark +void CreateMemoryContext() { + MemoryContext test_memory_context = AllocSetContextCreate( + (MemoryContext)NULL, "TestMemoryContext", 80 * 1024 * 1024, + 80 * 1024 * 1024, 80 * 1024 * 1024); + MemoryContextSwitchTo(test_memory_context); +} + +// Global registry +class BenchmarkRegistry { + private: + std::vector init_functions_; + std::vector cleanup_functions_; + bool initialized_ = false; + + public: + void RegisterInitFunction(InitFunction func) { + 
init_functions_.push_back(func); + } + + void RegisterCleanupFunction(CleanupFunction func) { + cleanup_functions_.push_back(func); + } + + void RunAllInitFunctions() { + if (initialized_) return; + + printf("Running PAX Benchmark Suite...\n"); + printf("Initializing all benchmark modules...\n\n"); + + for (const auto &func : init_functions_) { + func(); + } + initialized_ = true; + } + + void RunAllCleanupFunctions() { + if (!initialized_) return; + + printf("\nCleaning up all benchmark modules...\n"); + + // Cleanup functions executed in reverse order + for (auto it = cleanup_functions_.rbegin(); it != cleanup_functions_.rend(); + ++it) { + (*it)(); + } + initialized_ = false; + } +}; + +// Global registry access function +BenchmarkRegistry &GetBenchmarkRegistry() { + static BenchmarkRegistry instance; + return instance; +} + +// Registration functions +void RegisterBenchmarkInit(InitFunction func) { + GetBenchmarkRegistry().RegisterInitFunction(func); +} + +void RegisterBenchmarkCleanup(CleanupFunction func) { + GetBenchmarkRegistry().RegisterCleanupFunction(func); +} + +// Global Mock functions for benchmark framework +bool MockMinMaxGetStrategyProcinfo(Oid, Oid, Oid *, FmgrInfo *, + StrategyNumber) { + return false; +} + +int32 MockGetFastSequences(Oid) { + static int32 mock_id = 0; + return mock_id++; +} + +void MockInsertMicroPartitionPlaceHolder(Oid, int) {} +void MockDeleteMicroPartitionEntry(Oid, Snapshot, int) {} +void MockExecStoreVirtualTuple(TupleTableSlot *) {} + +std::string MockBuildPaxDirectoryPath(RelFileNode rnode, BackendId backend_id) { + // Create a simple file path for benchmarks + return std::string("./bench_data"); +} + +std::vector MockGetMinMaxColumnIndexes(Relation) { + return std::vector(); +} + +std::vector MockBloomFilterColumnIndexes(Relation) { + return std::vector(); +} + +std::vector> MockGetRelEncodingOptions( + Relation relation) { + std::vector> encoding_opts; + + // Get number of columns from relation + int num_columns = 10; // default for benchmark + if (relation && relation->rd_att) { + num_columns = relation->rd_att->natts; + } + + // Create encoding options for each column (NO_ENCODED, 0) + for (int i = 0; i < num_columns; i++) { + encoding_opts.emplace_back( + std::make_tuple(ColumnEncoding_Kind_NO_ENCODED, 0)); + } + + return encoding_opts; +} + +// Mock TupleDescInitEntry that doesn't rely on SYSCACHE +void MockTupleDescInitEntry(TupleDesc desc, AttrNumber attributeNumber, + const char *attributeName, Oid oidtypeid, + int32 typmod, int attdim) { + // Basic validation + if (attributeNumber < 1 || attributeNumber > desc->natts) { + return; + } + + Form_pg_attribute att = TupleDescAttr(desc, attributeNumber - 1); + + // Set basic attribute properties + namestrcpy(&(att->attname), attributeName); + att->atttypid = oidtypeid; + att->atttypmod = typmod; + att->attndims = attdim; + att->attnum = attributeNumber; + att->attnotnull = false; + att->atthasdef = false; + att->attidentity = '\0'; + att->attgenerated = '\0'; + att->attisdropped = false; + att->attislocal = true; + att->attinhcount = 0; + att->attcollation = InvalidOid; + + // Set type-specific properties based on OID (hardcoded for common types) + switch (oidtypeid) { + case INT2OID: // smallint + att->attlen = 2; + att->attalign = 's'; + att->attstorage = 'p'; + att->attbyval = true; + break; + case INT4OID: // integer + att->attlen = 4; + att->attalign = 'i'; + att->attstorage = TYPSTORAGE_PLAIN; + att->attbyval = true; + break; + case INT8OID: // bigint + att->attlen = 8; + att->attalign 
= 'd'; + att->attstorage = TYPSTORAGE_PLAIN; + att->attbyval = FLOAT8PASSBYVAL; + break; + case FLOAT8OID: // double precision + att->attlen = 8; + att->attalign = 'd'; + att->attstorage = 'p'; + att->attbyval = FLOAT8PASSBYVAL; + break; + case BOOLOID: // boolean + att->attlen = 1; + att->attalign = 'c'; + att->attstorage = 'p'; + att->attbyval = true; + break; + case TEXTOID: // text + att->attlen = -1; + att->attalign = 'i'; + att->attstorage = TYPSTORAGE_PLAIN; + att->attbyval = false; + att->attcollation = DEFAULT_COLLATION_OID; + break; + case NUMERICOID: // numeric + att->attlen = -1; + att->attalign = TYPALIGN_INT; + att->attstorage = TYPSTORAGE_PLAIN; + att->attbyval = false; + break; + case TIMESTAMPOID: // timestamp + att->attlen = 8; + att->attalign = 'd'; + att->attstorage = TYPSTORAGE_PLAIN; + att->attbyval = FLOAT8PASSBYVAL; + break; + default: + // Default values for unknown types + att->attlen = -1; + att->attalign = 'i'; + att->attstorage = 'p'; + att->attbyval = false; + break; + } +} + +// Global initialization function for general benchmark framework +void GlobalBenchmarkInit() { + static bool global_initialized = false; + if (global_initialized) return; + + printf("Initializing PAX benchmark framework...\n"); + + // Initialize memory context + MemoryContextInit(); + + // Setup global Mock functions + static std::unique_ptr stub_global = std::make_unique(); + + stub_global->set(MinMaxGetPgStrategyProcinfo, MockMinMaxGetStrategyProcinfo); + stub_global->set(CPaxGetFastSequences, MockGetFastSequences); + stub_global->set(cbdb::BuildPaxDirectoryPath, MockBuildPaxDirectoryPath); + stub_global->set(cbdb::InsertMicroPartitionPlaceHolder, + MockInsertMicroPartitionPlaceHolder); + stub_global->set(cbdb::DeleteMicroPartitionEntry, + MockDeleteMicroPartitionEntry); + stub_global->set(cbdb::GetMinMaxColumnIndexes, MockGetMinMaxColumnIndexes); + stub_global->set(cbdb::GetBloomFilterColumnIndexes, + MockBloomFilterColumnIndexes); + stub_global->set(cbdb::GetRelEncodingOptions, MockGetRelEncodingOptions); + stub_global->set(ExecStoreVirtualTuple, MockExecStoreVirtualTuple); + stub_global->set(TupleDescInitEntry, MockTupleDescInitEntry); + + // Create basic test directory + system("mkdir -p ./bench_data"); + + global_initialized = true; + printf("PAX benchmark framework initialized.\n"); +} + +// Global cleanup function for general benchmark framework +void GlobalBenchmarkCleanup() { + printf("Cleaning up PAX benchmark framework...\n"); + + // Clean up test directory + // system("rm -rf ./bench_data"); + + // Reset memory context + if (TopMemoryContext) { + MemoryContextReset(TopMemoryContext); + } + + printf("PAX benchmark framework cleaned up.\n"); +} + +// Example benchmark test +static void example_benchmark(::benchmark::State &state) { for (auto _ : state) { + // Empty example test } } BENCHMARK(example_benchmark); -BENCHMARK_MAIN(); \ No newline at end of file +} // namespace pax::benchmark + +// Global cleanup function (C-style for atexit) +static void cleanup_all() { + pax::bench::GetBenchmarkRegistry().RunAllCleanupFunctions(); + pax::bench::GlobalBenchmarkCleanup(); +} + +// Main entry function +int main(int argc, char **argv) { + // Register global cleanup function + std::atexit(cleanup_all); + + // Global initialization + pax::bench::GlobalBenchmarkInit(); + + // Run all registered initialization functions + pax::bench::GetBenchmarkRegistry().RunAllInitFunctions(); + + // Initialize benchmark framework + ::benchmark::Initialize(&argc, argv); + if 
(::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + + printf("\n=== Starting PAX Benchmark Suite ===\n"); + printf("Use --benchmark_filter= to run specific tests\n"); + printf("Use --benchmark_list_tests to see all available tests\n\n"); + + // Run benchmark + ::benchmark::RunSpecifiedBenchmarks(); + + return 0; +} \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/pax_gbench.h b/contrib/pax_storage/src/cpp/pax_gbench.h new file mode 100644 index 00000000000..44376022693 --- /dev/null +++ b/contrib/pax_storage/src/cpp/pax_gbench.h @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * pax_gbench.h + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/pax_gbench.h + * + *------------------------------------------------------------------------- + */ + +#pragma once + +#include +#include + +namespace pax { + +namespace bench { + +// Generic initialization and cleanup function types +using InitFunction = std::function; +using CleanupFunction = std::function; + +// Create memory context for benchmark +extern void CreateMemoryContext(); + +// Forward declaration +class BenchmarkRegistry; + +// Global registry access function +BenchmarkRegistry &GetBenchmarkRegistry(); + +// Global initialization and cleanup functions +void GlobalBenchmarkInit(); +void GlobalBenchmarkCleanup(); + +// Registration functions (implemented in pax_gbench.cc) +void RegisterBenchmarkInit(InitFunction func); +void RegisterBenchmarkCleanup(CleanupFunction func); + +} // namespace benchmark +} // namespace pax + +// Convenient registration macros +#define REGISTER_BENCHMARK_INIT(func) \ + static bool BENCHMARK_INIT_##__COUNTER__ = []() { \ + pax::bench::RegisterBenchmarkInit(func); \ + return true; \ + }() + +#define REGISTER_BENCHMARK_CLEANUP(func) \ + static bool BENCHMARK_CLEANUP_##__COUNTER__ = []() { \ + pax::bench::RegisterBenchmarkCleanup(func); \ + return true; \ + }() diff --git a/contrib/pax_storage/src/cpp/pax_gtest_helper.cc b/contrib/pax_storage/src/cpp/pax_gtest_helper.cc index f95e3549aca..89a8660baf6 100644 --- a/contrib/pax_storage/src/cpp/pax_gtest_helper.cc +++ b/contrib/pax_storage/src/cpp/pax_gtest_helper.cc @@ -63,35 +63,53 @@ void ReleaseTestResourceOwner() { ResourceOwnerDelete(tmp_resource_owner); } +void InitAttribute_text(Form_pg_attribute attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->atttypid = TEXTOID; + attr->attlen = -1; + attr->attbyval = false; + attr->attalign = TYPALIGN_DOUBLE; + attr->attstorage = TYPSTORAGE_PLAIN; + attr->attisdropped = false; + attr->attcollation = DEFAULT_COLLATION_OID; +} + +void InitAttribute_int4(Form_pg_attribute attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->atttypid = 
INT4OID; + attr->attlen = 4; + attr->attbyval = true; + attr->attalign = TYPALIGN_INT; + attr->attstorage = TYPSTORAGE_PLAIN; + attr->attisdropped = false; + attr->attcollation = InvalidOid; +} + +void InitAttribute_int8(Form_pg_attribute attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->atttypid = INT8OID; + attr->attlen = 8; + attr->attbyval = true; + attr->attalign = TYPALIGN_DOUBLE; + attr->attstorage = TYPSTORAGE_PLAIN; + attr->attisdropped = false; + attr->attcollation = InvalidOid; +} + static TupleDesc CreateTestTupleDesc(int ncols) { Assert(ncols >= COLUMN_NUMS); auto tuple_desc = reinterpret_cast(cbdb::Palloc0( sizeof(TupleDescData) + sizeof(FormData_pg_attribute) * ncols)); tuple_desc->natts = COLUMN_NUMS; - tuple_desc->attrs[0] = {.atttypid = TEXTOID, - .attlen = -1, - .attbyval = false, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = DEFAULT_COLLATION_OID}; - - tuple_desc->attrs[1] = {.atttypid = TEXTOID, - .attlen = -1, - .attbyval = false, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = DEFAULT_COLLATION_OID}; - - tuple_desc->attrs[2] = {.atttypid = INT4OID, - .attlen = 4, - .attbyval = true, - .attalign = TYPALIGN_INT, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = InvalidOid}; + + InitAttribute_text(&tuple_desc->attrs[0]); + InitAttribute_text(&tuple_desc->attrs[1]); + InitAttribute_int4(&tuple_desc->attrs[2]); + return tuple_desc; } diff --git a/contrib/pax_storage/src/cpp/pax_gtest_helper.h b/contrib/pax_storage/src/cpp/pax_gtest_helper.h index b9c2f42eeaa..7a78b5fdf19 100644 --- a/contrib/pax_storage/src/cpp/pax_gtest_helper.h +++ b/contrib/pax_storage/src/cpp/pax_gtest_helper.h @@ -60,4 +60,8 @@ extern void DeleteTestTupleTableSlot(TupleTableSlot *tuple_slot); extern void GenTextBuffer(char *buffer, size_t length); extern std::vector CreateTestSchemaTypes(); + +extern void InitAttribute_text(Form_pg_attribute attr); +extern void InitAttribute_int4(Form_pg_attribute attr); +extern void InitAttribute_int8(Form_pg_attribute attr); } // namespace pax::tests diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc index 2d4bd6b7d0f..7dc2ffcd146 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column.cc @@ -83,13 +83,21 @@ size_t PaxColumn::GetRangeNonNullRows(size_t start_pos, size_t len) { void PaxColumn::CreateNulls(size_t cap) { Assert(!null_bitmap_); - null_bitmap_ = std::make_shared(cap); - null_bitmap_->SetN(total_rows_); + // By default, initialize every bit in the null bitmap to 1. + // This is based on the assumption that null values are much less frequent + // than non-null values in most datasets. As a result, when appending non-null + // values, we can simply skip setting the bit to 1, since it is already set. + // Only when appending a null value do we need to explicitly clear the + // corresponding bit. + null_bitmap_ = std::make_unique(cap, 0xff); } void PaxColumn::AppendNull() { if (!null_bitmap_) { - CreateNulls(DEFAULT_CAPACITY); + // Ensure that the capacity of null_bitmap_ is pax_max_tuples_per_group. + // This design allows the use of raw_bitmap in normal cases without + // incurring the overhead of checking the bitmap's capacity. 
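    // Illustrative arithmetic (group size is hypothetical): for a group
    // limit of 131072 tuples, a Bitmap8 sized for the whole group costs
    // 131072 / 8 = 16 KiB per column, allocated once, so AppendNull()
    // never has to re-check or grow the bitmap within a group.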
+ CreateNulls(pax::pax_max_tuples_per_group); } null_bitmap_->Clear(total_rows_); ++total_rows_; @@ -111,7 +119,6 @@ void PaxColumn::AppendToast(char *buffer, size_t size) { } void PaxColumn::Append(char * /*buffer*/, size_t /*size*/) { - if (null_bitmap_) null_bitmap_->Set(total_rows_); ++total_rows_; ++non_null_rows_; } @@ -208,7 +215,7 @@ std::string PaxColumn::DebugString() { template PaxCommColumn::PaxCommColumn(uint32 capacity) { - data_ = std::make_shared>(capacity * sizeof(T)); + data_ = std::make_unique>(capacity * sizeof(T)); } template @@ -218,7 +225,7 @@ template // NOLINT: redirect constructor PaxCommColumn::PaxCommColumn() : PaxCommColumn(DEFAULT_CAPACITY) {} template -void PaxCommColumn::Set(std::shared_ptr> data) { +void PaxCommColumn::Set(std::unique_ptr> data) { data_ = std::move(data); } @@ -318,8 +325,8 @@ template class PaxCommColumn; PaxNonFixedColumn::PaxNonFixedColumn(uint32 data_capacity, uint32 offsets_capacity) : estimated_size_(0), - data_(std::make_shared>(data_capacity)), - offsets_(std::make_shared>(offsets_capacity)), + data_(std::make_unique>(data_capacity)), + offsets_(std::make_unique>(offsets_capacity)), next_offsets_(0) {} PaxNonFixedColumn::PaxNonFixedColumn() @@ -327,8 +334,8 @@ PaxNonFixedColumn::PaxNonFixedColumn() PaxNonFixedColumn::~PaxNonFixedColumn() {} -void PaxNonFixedColumn::Set(std::shared_ptr> data, - std::shared_ptr> offsets, +void PaxNonFixedColumn::Set(std::unique_ptr> data, + std::unique_ptr> offsets, size_t total_size) { estimated_size_ = total_size; data_ = std::move(data); diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column.h b/contrib/pax_storage/src/cpp/storage/columns/pax_column.h index f17320c0441..0adacb06829 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column.h @@ -46,7 +46,8 @@ namespace pax { -#define DEFAULT_CAPACITY 2048 +#define DEFAULT_CAPACITY \ + MIN(2048, MAX(16, MAXALIGN(pax::pax_max_tuples_per_group))) // Used to mapping pg_type enum PaxColumnTypeInMem { @@ -230,16 +231,25 @@ class PaxColumn { inline bool HasNull() { return null_bitmap_ != nullptr; } // Are all values null? - inline bool AllNull() const { return null_bitmap_ && null_bitmap_->Empty(); } + // Check whether all bits in the specified range are zero. + // In pax_column, to avoid checking the capacity of the null bitmap, we + // allocate memory based on pax_max_tuples_per_group. As a result, the last + // group may contain fewer tuples than pax_max_tuples_per_group, so we need to + // check whether all bits in the range [0, total_rows_) are zero. 
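  // Worked example: suppose the bitmap was allocated for a full group and
  // initialized to 0xff, but the last group holds only 1000 rows, all of
  // them null. AppendNull() cleared bits [0, 1000), while the bits beyond
  // 1000 still carry the 0xff init value, so the unranged Empty() would
  // report false; only Empty(total_rows_) gives the correct answer here.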
+ inline bool AllNull() const { + return null_bitmap_ && null_bitmap_->Empty(total_rows_); + } // Set the null bitmap - inline void SetBitmap(std::shared_ptr null_bitmap) { + inline void SetBitmap(std::unique_ptr null_bitmap) { Assert(!null_bitmap_); null_bitmap_ = std::move(null_bitmap); } // Get the null bitmap - inline std::shared_ptr GetBitmap() const { return null_bitmap_; } + inline const std::unique_ptr &GetBitmap() const { + return null_bitmap_; + } // Set the column kv attributes void SetAttributes(const std::map &attrs); @@ -354,7 +364,7 @@ class PaxColumn { protected: // null field bit map - std::shared_ptr null_bitmap_; + std::unique_ptr null_bitmap_; // Writer: write pointer // Reader: total rows @@ -425,7 +435,7 @@ class PaxCommColumn : public PaxColumn { PaxCommColumn(); - virtual void Set(std::shared_ptr> data); + virtual void Set(std::unique_ptr> data); PaxColumnTypeInMem GetPaxColumnTypeInMem() const override; @@ -455,7 +465,7 @@ class PaxCommColumn : public PaxColumn { int32 GetTypeLength() const override; protected: - std::shared_ptr> data_; + std::unique_ptr> data_; }; extern template class PaxCommColumn; @@ -474,8 +484,8 @@ class PaxNonFixedColumn : public PaxColumn { ~PaxNonFixedColumn() override; - virtual void Set(std::shared_ptr> data, - std::shared_ptr> offsets, + virtual void Set(std::unique_ptr> data, + std::unique_ptr> offsets, size_t total_size); void Append(char *buffer, size_t size) override; @@ -514,13 +524,13 @@ class PaxNonFixedColumn : public PaxColumn { protected: size_t estimated_size_; - std::shared_ptr> data_; + std::unique_ptr> data_; // orc needs to serialize int32 array // the length of a single tuple field will not exceed 2GB, // so a variable-length element of the offsets stream can use int32 // to represent the length - std::shared_ptr> offsets_; + std::unique_ptr> offsets_; // used to record next offset in write path // in read path, next_offsets_ always be -1 diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc index dfd346ef615..b1c75c39217 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_column_test.cc @@ -137,7 +137,7 @@ static std::unique_ptr CreateDecodeColumn( std::unique_ptr column_rc; switch (bits) { case 16: { - auto buffer_for_read = std::make_shared>( + auto buffer_for_read = std::make_unique>( reinterpret_cast(encoded_buff), encoded_len, false, false); buffer_for_read->Brush(encoded_len); @@ -145,19 +145,19 @@ static std::unique_ptr CreateDecodeColumn( auto int_column = ColumnOptCreateTraits::create_decoding( origin_len / sizeof(int16), std::move(decoding_option)); - int_column->Set(buffer_for_read); + int_column->Set(std::move(buffer_for_read)); column_rc = std::move(int_column); } else { auto int_column = ColumnOptCreateTraits::create_decoding( origin_len / sizeof(int16), std::move(decoding_option)); - int_column->Set(buffer_for_read, column_not_nulls); + int_column->Set(std::move(buffer_for_read), column_not_nulls); column_rc = std::move(int_column); } break; } case 32: { - auto buffer_for_read = std::make_shared>( + auto buffer_for_read = std::make_unique>( reinterpret_cast(encoded_buff), encoded_len, false, false); buffer_for_read->Brush(encoded_len); @@ -165,19 +165,19 @@ static std::unique_ptr CreateDecodeColumn( auto int_column = ColumnOptCreateTraits::create_decoding( origin_len / sizeof(int32), std::move(decoding_option)); - 
int_column->Set(buffer_for_read); + int_column->Set(std::move(buffer_for_read)); column_rc = std::move(int_column); } else { auto int_column = ColumnOptCreateTraits::create_decoding( origin_len / sizeof(int32), std::move(decoding_option)); - int_column->Set(buffer_for_read, column_not_nulls); + int_column->Set(std::move(buffer_for_read), column_not_nulls); column_rc = std::move(int_column); } break; } case 64: { - auto buffer_for_read = std::make_shared>( + auto buffer_for_read = std::make_unique>( reinterpret_cast(encoded_buff), encoded_len, false, false); buffer_for_read->Brush(encoded_len); @@ -185,13 +185,13 @@ static std::unique_ptr CreateDecodeColumn( auto int_column = ColumnOptCreateTraits::create_decoding( origin_len / sizeof(int64), std::move(decoding_option)); - int_column->Set(buffer_for_read); + int_column->Set(std::move(buffer_for_read)); column_rc = std::move(int_column); } else { auto int_column = ColumnOptCreateTraits::create_decoding( origin_len / sizeof(int64), std::move(decoding_option)); - int_column->Set(buffer_for_read, column_not_nulls); + int_column->Set(std::move(buffer_for_read), column_not_nulls); column_rc = std::move(int_column); } break; @@ -697,7 +697,6 @@ TEST_P(PaxNonFixedColumnCompressTest, auto number = ::testing::get<0>(GetParam()); auto kind = ::testing::get<1>(GetParam()); auto verify_range = ::testing::get<2>(GetParam()); - auto enable_offsets_encoding = ::testing::get<2>(GetParam()); const size_t number_of_rows = 1024; PaxEncoder::EncodingOption encoding_option; @@ -705,10 +704,9 @@ TEST_P(PaxNonFixedColumnCompressTest, encoding_option.compress_level = 5; encoding_option.is_sign = true; - if (enable_offsets_encoding) { - encoding_option.offsets_encode_type = kind; - encoding_option.offsets_compress_level = 5; - } + encoding_option.offsets_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + encoding_option.offsets_compress_level = 5; non_fixed_column = new PaxNonFixedEncodingColumn( number_of_rows, number_of_rows, std::move(encoding_option)); @@ -744,22 +742,21 @@ TEST_P(PaxNonFixedColumnCompressTest, decoding_option.is_sign = true; decoding_option.compress_level = 5; - if (enable_offsets_encoding) { - decoding_option.offsets_encode_type = kind; - decoding_option.offsets_compress_level = 5; - } + decoding_option.offsets_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + decoding_option.offsets_compress_level = 5; auto non_fixed_column_for_read = new PaxNonFixedEncodingColumn( number_of_rows * number, sizeof(int32) * number_of_rows, std::move(decoding_option)); - auto data_buffer_for_read = std::make_shared>( + auto data_buffer_for_read = std::make_unique>( encoded_buff, encoded_len, false, false); data_buffer_for_read->Brush(encoded_len); - auto length_buffer_cpy = std::make_shared>( + auto length_buffer_cpy = std::make_unique>( (int32 *)offset_stream_buff, offset_stream_len, false, false); length_buffer_cpy->BrushAll(); - non_fixed_column_for_read->Set(data_buffer_for_read, length_buffer_cpy, - origin_len); + non_fixed_column_for_read->Set(std::move(data_buffer_for_read), + std::move(length_buffer_cpy), origin_len); ASSERT_EQ(non_fixed_column_for_read->GetCompressLevel(), 5); char *verify_buff; size_t verify_len; @@ -801,6 +798,9 @@ INSTANTIATE_TEST_SUITE_P( PaxColumnEncodingTestCombine, PaxColumnCompressTest, testing::Combine(testing::Values(16, 32, 64), testing::Values(ColumnEncoding_Kind_NO_ENCODED, +#ifdef USE_LZ4 + ColumnEncoding_Kind_COMPRESS_LZ4, +#endif ColumnEncoding_Kind_COMPRESS_ZSTD, 
ColumnEncoding_Kind_COMPRESS_ZLIB))); @@ -808,6 +808,9 @@ INSTANTIATE_TEST_SUITE_P( PaxColumnEncodingTestCombine, PaxNonFixedColumnCompressTest, testing::Combine(testing::Values(16, 32, 64), testing::Values(ColumnEncoding_Kind_NO_ENCODED, +#ifdef USE_LZ4 + ColumnEncoding_Kind_COMPRESS_LZ4, +#endif ColumnEncoding_Kind_COMPRESS_ZSTD, ColumnEncoding_Kind_COMPRESS_ZLIB), testing::Values(true, false), diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_columns.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_columns.cc index 4a181947ed8..7fd3c04ae60 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_columns.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_columns.cc @@ -377,7 +377,7 @@ size_t PaxColumns::MeasureVecDataBuffer( // has null will generate a bitmap in current stripe if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); size_t bm_length = bm->MinimalStoredBytes(total_rows); @@ -483,7 +483,7 @@ size_t PaxColumns::MeasureOrcDataBuffer( auto column = p_column.get(); // has null will generate a bitmap in current stripe if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); size_t bm_length = bm->MinimalStoredBytes(column->GetRows()); buffer_len += bm_length; @@ -592,7 +592,7 @@ void PaxColumns::CombineVecDataBuffer() { auto column = p_column.get(); if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); auto nbytes = bm->MinimalStoredBytes(column->GetRows()); Assert(nbytes <= bm->Raw().size); @@ -680,7 +680,7 @@ void PaxColumns::CombineOrcDataBuffer() { auto column = p_column.get(); if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); auto nbytes = bm->MinimalStoredBytes(column->GetRows()); Assert(nbytes <= bm->Raw().size); diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_compress.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_compress.cc index 87a34cbb6d7..f4bae52ea7d 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_compress.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_compress.cc @@ -50,6 +50,12 @@ std::shared_ptr PaxCompressor::CreateBlockCompressor( compressor = std::make_shared(); break; } +#ifdef USE_LZ4 + case ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_LZ4: { + compressor = std::make_shared(); + break; + } +#endif case ColumnEncoding_Kind::ColumnEncoding_Kind_DEF_ENCODED: { CBDB_RAISE(cbdb::CException::ExType::kExTypeLogicError, fmt("Invalid compress type %d", @@ -230,9 +236,12 @@ size_t PaxLZ4Compressor::GetCompressBound(size_t src_len) { } size_t PaxLZ4Compressor::Compress(void *dst_buff, size_t dst_cap, - void *src_buff, size_t src_len, int /*lvl*/) { - return LZ4_compress_default((char *)src_buff, (char *)dst_buff, src_len, - dst_cap); + void *src_buff, size_t src_len, int lvl) { + // acceleration affects compression speed, the larger acceleration value, + // the less compression ratio. 
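  // Illustrative values: compress level 1 maps to acceleration
  // (20 - 1) / 6 = 3 (fastest, lowest ratio), level 5 to 2, level 14 to 1,
  // and levels >= 15 to 0, which LZ4_compress_fast clamps back to its
  // default acceleration of 1.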
+ int acceleration = (20 - lvl) / 6; + return LZ4_compress_fast((char *)src_buff, (char *)dst_buff, src_len, + dst_cap, acceleration); } size_t PaxLZ4Compressor::Decompress(void *dst_buff, size_t dst_len, diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_compress_bench.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_compress_bench.cc new file mode 100644 index 00000000000..0a792601e99 --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_compress_bench.cc @@ -0,0 +1,421 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * pax_compress_bench.cc + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/storage/columns/pax_compress_bench.cc + * + *------------------------------------------------------------------------- + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "comm/cbdb_wrappers.h" +#include "comm/pax_memory.h" +#include "pax_gbench.h" +#include "storage/columns/pax_compress.h" +#include "storage/columns/pax_decoding.h" +#include "storage/columns/pax_delta_encoding.h" +#include "storage/columns/pax_rlev2_encoding.h" +#include "storage/pax_buffer.h" + +namespace pax::bench { + +namespace { + +// Test data and prebuilt buffers for decode/decompress benchmarks +static const size_t kCount = 1024 * 1024; +static std::vector g_offsets; +static std::unique_ptr g_raw_bytes; +static size_t g_raw_len = 0; + +static std::vector g_rle_encoded; +static size_t g_rle_len = 0; + +static std::vector g_delta_encoded; +static size_t g_delta_len = 0; + +static std::unique_ptr g_zstd_compressed; +static size_t g_zstd_len = 0; + +static std::shared_ptr g_zstd; + +// Simple helpers for bench data persistence +static void EnsureDirExists(const char *dir_path) { + if (mkdir(dir_path, 0755) != 0) { + if (errno != EEXIST) { + std::cerr << "Failed to create directory: " << dir_path << std::endl; + std::abort(); + } + } +} + +static bool ReadWholeFile(const char *path, std::vector &out) { + std::ifstream in(path, std::ios::binary); + if (!in.is_open()) return false; + in.seekg(0, std::ios::end); + std::streampos size = in.tellg(); + if (size <= 0) return false; + out.resize(static_cast(size)); + in.seekg(0, std::ios::beg); + in.read(out.data(), size); + return static_cast(in); +} + +static bool ReadWholeFile(const char *path, std::unique_ptr &out, + size_t &out_len) { + std::ifstream in(path, std::ios::binary); + if (!in.is_open()) return false; + in.seekg(0, std::ios::end); + std::streampos size = in.tellg(); + if (size <= 0) return false; + out_len = static_cast(size); + out = std::make_unique(out_len); + in.seekg(0, std::ios::beg); + 
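  // The helper sizes the buffer from tellg() at EOF, rewinds, reads the
  // whole file in one call, and reports success by converting the stream
  // state to bool (false on a short or failed read).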
in.read(out.get(), size); + return static_cast(in); +} + +static void WriteWholeFile(const char *path, const char *data, size_t len) { + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + std::cerr << "Failed to open file for write: " << path << std::endl; + std::abort(); + } + out.write(data, static_cast(len)); + if (!out) { + std::cerr << "Failed to write file: " << path << std::endl; + std::abort(); + } +} + +static const char *kBenchDataDir = "bench_data"; +static const char *kRLEV2Path = "bench_data/rle_v2_u32.bin"; +static const char *kDeltaPath = "bench_data/delta_u32.bin"; +static const char *kZSTDPath = "bench_data/zstd_u32.bin"; +static const char *kRawPath = "bench_data/raw_u32.bin"; + +static std::vector GenerateMonotonicOffsets(size_t n, uint32_t seed) { + std::vector offsets; + offsets.resize(n); + offsets[0] = 0; + std::mt19937 rng(seed); + std::uniform_int_distribution step_dist(1, 256); + for (size_t i = 1; i < n; ++i) { + offsets[i] = offsets[i - 1] + static_cast(step_dist(rng)); + } + return offsets; +} + +// Lazily ensure raw bytes are available (prefer loading from disk) +static void EnsureRawData() { + if (g_raw_len != 0 && g_raw_bytes) return; + EnsureDirExists(kBenchDataDir); + std::vector raw_from_file; + if (ReadWholeFile(kRawPath, raw_from_file)) { + g_raw_len = raw_from_file.size(); + g_raw_bytes = std::make_unique(g_raw_len); + std::memcpy(g_raw_bytes.get(), raw_from_file.data(), g_raw_len); + return; + } + // Fallback: generate and persist + g_offsets = GenerateMonotonicOffsets(kCount, /*seed=*/12345); + g_raw_len = g_offsets.size() * sizeof(uint32_t); + g_raw_bytes = std::make_unique(g_raw_len); + std::memcpy(g_raw_bytes.get(), g_offsets.data(), g_raw_len); + WriteWholeFile(kRawPath, g_raw_bytes.get(), g_raw_len); +} + +// Lazily ensure RLEv2 encoded buffer exists (load or build from raw) +static void EnsureRleEncoded() { + if (g_rle_len != 0 && !g_rle_encoded.empty()) return; + EnsureDirExists(kBenchDataDir); + if (ReadWholeFile(kRLEV2Path, g_rle_encoded)) { + g_rle_len = g_rle_encoded.size(); + return; + } + EnsureRawData(); + PaxEncoder::EncodingOption enc_opt; + enc_opt.column_encode_type = ColumnEncoding_Kind_RLE_V2; + enc_opt.is_sign = false; + + PaxOrcEncoder rle_encoder(enc_opt); + auto rle_out = std::make_shared>(g_raw_len); + rle_encoder.SetDataBuffer(rle_out); + // encode directly from raw bytes to avoid depending on g_offsets + size_t count = g_raw_len / sizeof(uint32_t); + const uint32_t *vals = reinterpret_cast(g_raw_bytes.get()); + for (size_t i = 0; i < count; ++i) { + uint32_t v = vals[i]; + rle_encoder.Append(reinterpret_cast(&v), sizeof(uint32_t)); + } + rle_encoder.Flush(); + + g_rle_len = rle_encoder.GetBufferSize(); + g_rle_encoded.assign(rle_encoder.GetBuffer(), + rle_encoder.GetBuffer() + g_rle_len); + WriteWholeFile(kRLEV2Path, g_rle_encoded.data(), g_rle_len); +} + +// Lazily ensure Delta encoded buffer exists (load or build from raw) +static void EnsureDeltaEncoded() { + if (g_delta_len != 0 && !g_delta_encoded.empty()) return; + EnsureDirExists(kBenchDataDir); + if (ReadWholeFile(kDeltaPath, g_delta_encoded)) { + g_delta_len = g_delta_encoded.size(); + return; + } + EnsureRawData(); + PaxEncoder::EncodingOption enc_opt; + enc_opt.is_sign = false; + // type not used by PaxDeltaEncoder + PaxDeltaEncoder delta_encoder(enc_opt); + auto delta_out = std::make_shared>(g_raw_len); + delta_encoder.SetDataBuffer(delta_out); + // Encode whole array in one shot + delta_encoder.Append(g_raw_bytes.get(), 
g_raw_len); + delta_encoder.Flush(); + + g_delta_len = delta_encoder.GetBufferSize(); + g_delta_encoded.assign(delta_encoder.GetBuffer(), + delta_encoder.GetBuffer() + g_delta_len); + WriteWholeFile(kDeltaPath, g_delta_encoded.data(), g_delta_len); +} + +// Lazily ensure ZSTD compressed buffer exists (load or build from raw) +static void EnsureZstdCompressed() { + EnsureDirExists(kBenchDataDir); + if (!g_zstd) { + g_zstd = + PaxCompressor::CreateBlockCompressor(ColumnEncoding_Kind_COMPRESS_ZSTD); + if (!g_zstd) { + std::cerr << "Failed to create ZSTD compressor" << std::endl; + std::abort(); + } + } + if (g_zstd_len != 0 && g_zstd_compressed) return; + if (ReadWholeFile(kZSTDPath, g_zstd_compressed, g_zstd_len)) { + return; + } + EnsureRawData(); + size_t bound = g_zstd->GetCompressBound(g_raw_len); + g_zstd_compressed = std::make_unique(bound); + g_zstd_len = g_zstd->Compress(g_zstd_compressed.get(), bound, + g_raw_bytes.get(), g_raw_len, /*lvl=*/5); + if (g_zstd->IsError(g_zstd_len) || g_zstd_len == 0) { + std::cerr << "ZSTD one-time compress failed" << std::endl; + std::abort(); + } + WriteWholeFile(kZSTDPath, g_zstd_compressed.get(), g_zstd_len); +} + +static void PrepareOnce() { + pax::bench::CreateMemoryContext(); + EnsureDirExists(kBenchDataDir); +} + +static void CleanupBenchData() { + const char *files[] = {kRLEV2Path, kDeltaPath, kZSTDPath, kRawPath}; + for (const char *p : files) { + std::remove(p); + } + + rmdir(kBenchDataDir); +} + +} // namespace + +// Register module init with gbench framework +REGISTER_BENCHMARK_INIT(PrepareOnce); +REGISTER_BENCHMARK_CLEANUP(CleanupBenchData); + +// RLEv2 encode benchmark +static void BM_RLEV2_Encode(::benchmark::State &state) { + // Prepare raw data only; no encoded buffers are created here + EnsureRawData(); + for (auto _ : state) { + PaxEncoder::EncodingOption enc_opt; + enc_opt.column_encode_type = ColumnEncoding_Kind_RLE_V2; + enc_opt.is_sign = false; + + PaxOrcEncoder encoder(enc_opt); + auto out = std::make_shared>(g_raw_len); + encoder.SetDataBuffer(out); + + size_t count = g_raw_len / sizeof(uint32_t); + const uint32_t *vals = + reinterpret_cast(g_raw_bytes.get()); + for (size_t i = 0; i < count; ++i) { + uint32_t v = vals[i]; + encoder.Append(reinterpret_cast(&v), sizeof(uint32_t)); + } + encoder.Flush(); + g_rle_len = encoder.GetBufferSize(); + benchmark::DoNotOptimize(encoder.GetBuffer()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(static_cast(state.iterations()) * + static_cast(g_raw_len)); + state.counters["raw_kb"] = + benchmark::Counter(static_cast(g_raw_len) / (1024.0)); + state.counters["rle_kb"] = + benchmark::Counter(static_cast(g_rle_len) / (1024.0)); +} +BENCHMARK(BM_RLEV2_Encode); + +// RLEv2 decode benchmark +static void BM_RLEV2_Decode(::benchmark::State &state) { + // Ensure we have raw size and encoded buffer ready (prefer from disk) + EnsureRawData(); + EnsureRleEncoded(); + for (auto _ : state) { + PaxDecoder::DecodingOption dec_opt; + dec_opt.column_encode_type = ColumnEncoding_Kind_RLE_V2; + dec_opt.is_sign = false; + + auto decoder = PaxDecoder::CreateDecoder(dec_opt); + auto out = std::make_shared>(g_raw_len); + decoder->SetSrcBuffer(g_rle_encoded.data(), g_rle_len); + decoder->SetDataBuffer(out); + size_t n = decoder->Decoding(); + benchmark::DoNotOptimize(n); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(static_cast(state.iterations()) * + static_cast(g_raw_len)); +} +BENCHMARK(BM_RLEV2_Decode); + +// Delta encode benchmark +static void BM_Delta_Encode(::benchmark::State 
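+// Like the RLEv2 benchmarks above, the encoder is re-created on every
+// iteration, DoNotOptimize/ClobberMemory keep the work observable, and
+// SetBytesProcessed reports throughput against the raw (uncompressed) size.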
&state) { + EnsureRawData(); + for (auto _ : state) { + PaxEncoder::EncodingOption enc_opt; + enc_opt.is_sign = false; + PaxDeltaEncoder encoder(enc_opt); + auto out = std::make_shared>(g_raw_len); + encoder.SetDataBuffer(out); + encoder.Append(g_raw_bytes.get(), g_raw_len); + encoder.Flush(); + g_delta_len = encoder.GetBufferSize(); + benchmark::DoNotOptimize(encoder.GetBuffer()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(static_cast(state.iterations()) * + static_cast(g_raw_len)); + state.counters["delta_kb"] = + benchmark::Counter(static_cast(g_delta_len) / (1024.0)); +} +BENCHMARK(BM_Delta_Encode); + +// Delta decode benchmark +static void BM_Delta_Decode(::benchmark::State &state) { + EnsureRawData(); + EnsureDeltaEncoded(); + for (auto _ : state) { + PaxDecoder::DecodingOption dec_opt; + dec_opt.is_sign = false; + dec_opt.column_encode_type = ColumnEncoding_Kind_DIRECT_DELTA; + PaxDeltaDecoder decoder(dec_opt); + auto out = std::make_shared>(g_raw_len); + decoder.SetSrcBuffer(g_delta_encoded.data(), g_delta_len); + decoder.SetDataBuffer(out); + size_t n = decoder.Decoding(); + if (n != g_raw_len / sizeof(uint32_t) && out->Used() != g_raw_len) { + std::cerr << "Delta decode failed, n: " << n + << ", g_raw_len: " << g_raw_len + << ", g_delta_len: " << g_delta_len + << ", out: Used: " << out->Used() << std::endl; + std::abort(); + } + + if (memcmp(out->GetBuffer(), g_raw_bytes.get(), g_raw_len) != 0) { + std::cerr << "Delta decode failed, out: " << out->GetBuffer() + << ", g_raw_bytes: " << g_raw_bytes.get() << std::endl; + std::abort(); + } + + benchmark::DoNotOptimize(n); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(static_cast(state.iterations()) * + static_cast(g_raw_len)); +} +BENCHMARK(BM_Delta_Decode); + +// ZSTD compress benchmark +static void BM_ZSTD_Compress(::benchmark::State &state) { + EnsureRawData(); + if (!g_zstd) { + g_zstd = + PaxCompressor::CreateBlockCompressor(ColumnEncoding_Kind_COMPRESS_ZSTD); + if (!g_zstd) { + std::cerr << "Failed to create ZSTD compressor" << std::endl; + std::abort(); + } + } + size_t bound = g_zstd->GetCompressBound(g_raw_len); + std::unique_ptr dst(new char[bound]); + for (auto _ : state) { + size_t n = g_zstd->Compress(dst.get(), bound, g_raw_bytes.get(), g_raw_len, + /*lvl=*/5); + g_zstd_len = n; + benchmark::DoNotOptimize(n); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(static_cast(state.iterations()) * + static_cast(g_raw_len)); + state.counters["zstd_kb"] = + benchmark::Counter(static_cast(g_zstd_len) / (1024.0)); +} +BENCHMARK(BM_ZSTD_Compress); + +// ZSTD decompress benchmark +static void BM_ZSTD_Decompress(::benchmark::State &state) { + EnsureRawData(); + EnsureZstdCompressed(); + std::unique_ptr dst(new char[g_raw_len]); + for (auto _ : state) { + size_t n = g_zstd->Decompress(dst.get(), g_raw_len, g_zstd_compressed.get(), + g_zstd_len); + benchmark::DoNotOptimize(n); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(static_cast(state.iterations()) * + static_cast(g_raw_len)); +} +BENCHMARK(BM_ZSTD_Decompress); + +} // namespace pax::bench diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_decoding.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_decoding.cc index 7ba0fcd6768..0e15ec52088 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_decoding.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_decoding.cc @@ -31,6 +31,7 @@ #include "comm/pax_memory.h" #include "storage/columns/pax_dict_encoding.h" #include 
"storage/columns/pax_rlev2_decoding.h" +#include "storage/columns/pax_delta_encoding.h" namespace pax { @@ -47,7 +48,7 @@ std::shared_ptr PaxDecoder::CreateDecoder(const DecodingOption &deco break; } case ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA: { - /// TODO(jiaqizho) support it + decoder = std::make_shared>(decoder_options); break; } case ColumnEncoding_Kind::ColumnEncoding_Kind_DICTIONARY: { diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.cc new file mode 100644 index 00000000000..3f4b5341c4a --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.cc @@ -0,0 +1,511 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * pax_delta_encoding.cc + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.cc + * + *------------------------------------------------------------------------- + */ +#include "storage/columns/pax_delta_encoding.h" + +#include +#include +#include + +namespace pax { + +// delta bitpack encoder +template +PaxDeltaEncoder::PaxDeltaEncoder(const EncodingOption &encoder_options) + : PaxEncoder(encoder_options) {} + +template +void PaxDeltaEncoder::Append(char *data, size_t size) { + CBDB_CHECK(!has_append_, cbdb::CException::kExTypeAbort, + fmt("PaxDeltaEncoder::Append only support Append Once")); + has_append_ = true; + + auto T_data = reinterpret_cast(data); + auto T_data_len = size / sizeof(T); + Encode(T_data, T_data_len); +} + +inline uint8_t NumBitsAllowZero(uint32_t value) { + if (value == 0) return 0; + uint8_t bits = 0; + while (value) { + bits++; + value >>= 1; + } + return bits; +} + +// Fast bit width calculation (0 -> 0) +inline static uint8_t FastNumBits(uint32_t v) { +#if defined(__GNUC__) || defined(__clang__) + return v == 0 ? 
0 : static_cast(32 - __builtin_clz(v)); +#else + uint8_t bits = 0; + while (v) { + ++bits; + v >>= 1; + } + return bits; +#endif +} + +// 64-bit bit writer based on raw pointer (writes to reserved DataBuffer range) +struct BitWriter64Ptr { + uint8_t *out; + size_t index; + uint64_t bit_buffer; + uint32_t bit_count; + + BitWriter64Ptr(uint8_t *p) : out(p), index(0), bit_buffer(0), bit_count(0) {} + + inline void Append(uint32_t value, uint8_t width) { + if (width == 0) return; + bit_buffer |= (static_cast(value) << bit_count); + bit_count += width; + while (bit_count >= 8) { + out[index++] = static_cast(bit_buffer & 0xFF); + bit_buffer >>= 8; + bit_count -= 8; + } + } + + inline void FlushToByte() { + if (bit_count > 0) { + out[index++] = static_cast(bit_buffer & 0xFF); + bit_buffer = 0; + bit_count = 0; + } + } +}; + +// 64-bit bit reader based on raw pointer (limited to specified payload bytes) +struct BitReader64Ptr { + const uint8_t *in; + size_t size; + size_t index; + uint64_t bit_buffer; + uint32_t bit_count; + + BitReader64Ptr(const uint8_t *p, size_t len) + : in(p), size(len), index(0), bit_buffer(0), bit_count(0) {} + + inline void Ensure(uint32_t need_bits) { + while (bit_count < need_bits && index < size) { + bit_buffer |= (static_cast(in[index]) << bit_count); + ++index; + bit_count += 8; + } + } + + inline uint32_t Read(uint8_t width) { + if (width == 0) return 0; + Ensure(width); + uint32_t result; + if (width == 32) + result = static_cast(bit_buffer & 0xFFFFFFFFull); + else + result = static_cast(bit_buffer & ((1ull << width) - 1)); + bit_buffer >>= width; + bit_count -= width; + return result; + } + + inline void AlignToByte() { + uint32_t drop = bit_count % 8; + if (drop) { + bit_buffer >>= drop; + bit_count -= drop; + } + } +}; + +/* +Overall layout: + DeltaBlockHeader (struct, fixed-size) + - uint32 value_per_block + - uint32 values_per_mini_block + - uint32 total_count + T first_value + [Repeated Block until total_count is exhausted] + - uint32 min_delta + - uint8 bit_widths[ mini_blocks_per_block ] + - uint8 payload[computed from bit_widths] + // bit-packed adjusted deltas, mini-block by mini-block + // within a block: bits are written MSB-first, end aligned to byte +*/ + +template +size_t PaxDeltaEncoder::GetBoundSize(size_t src_len) const { + size_t value_count = src_len / sizeof(T); + size_t block_count = (value_count + value_per_block_ - 1) / value_per_block_; + /* header + first_value + block_count * (min_delta + bit_widths ) + * + payload was eliminated to value_count*/ + return sizeof(DeltaBlockHeader) + sizeof(T) + + block_count * (sizeof(uint32) + mini_blocks_per_block_) + value_count; +} + +template +void PaxDeltaEncoder::Encode(T *data, size_t count) { + // Estimate allocation: by element byte count, sufficient to accommodate + // header and bit stream + if (result_buffer_->Capacity() < + count * sizeof(T) + sizeof(DeltaBlockHeader) + sizeof(T)) { + result_buffer_->ReSize(count * sizeof(T) + sizeof(DeltaBlockHeader) + + sizeof(T)); + } + + DeltaBlockHeader header; + header.value_per_block = value_per_block_; + header.values_per_mini_block = values_per_mini_block_; + header.total_count = count; + // add delta block header + result_buffer_->Write(reinterpret_cast(&header), sizeof(header)); + result_buffer_->Brush(sizeof(header)); + // add base value + result_buffer_->Write(reinterpret_cast(&data[0]), sizeof(data[0])); + result_buffer_->Brush(sizeof(data[0])); + + size_t values_emitted = 1; + T previous_value = data[0]; + + while (values_emitted < count) { + 
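+    // Per block: compute raw deltas against the previous value, track the
+    // block-wide min_delta and each mini-block's maximum, derive one bit
+    // width per mini-block from (max - min_delta), then bit-pack the adjusted
+    // deltas (delta - min_delta) via BitWriter64Ptr, which flushes low-order
+    // bits first, and pad the block's payload to a whole byte.
+    //
+    // Worked example (mirrors the ConsecutiveSequence test): {1, 2, 3, 4, 5}
+    // gives first_value = 1 and deltas {1, 1, 1, 1}; with min_delta = 1 every
+    // adjusted delta is 0, all mini-block widths are 0, and the block needs
+    // no payload bytes at all.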
uint32_t values_in_block = std::min( + value_per_block_, static_cast(count - values_emitted)); + + if (deltas_scratch_.size() < values_in_block) { + deltas_scratch_.resize(values_in_block); + } + uint32_t *deltas = deltas_scratch_.data(); + uint32_t min_delta = UINT32_MAX; + uint32_t mini_max[mini_blocks_per_block_] = {0}; + + for (uint32_t i = 0; i < values_in_block; ++i) { + T current = data[values_emitted + i]; + uint32_t delta = static_cast(current - previous_value); + deltas[i] = delta; + previous_value = current; + if (delta < min_delta) min_delta = delta; + uint32_t mini_index = i / values_per_mini_block_; + if (delta > mini_max[mini_index]) mini_max[mini_index] = delta; + } + + // write block header: min_delta later + uint8_t bit_widths[mini_blocks_per_block_] = {0}; + uint64_t total_bits = 0; + for (uint32_t i = 0; i < mini_blocks_per_block_; ++i) { + uint32_t start = i * values_per_mini_block_; + if (start >= values_in_block) { + bit_widths[i] = 0; + continue; + } + uint32_t adjusted_max = mini_max[i] - min_delta; + uint8_t w = FastNumBits(adjusted_max); + bit_widths[i] = w; + uint32_t end = std::min(start + values_per_mini_block_, values_in_block); + total_bits += static_cast(w) * (end - start); + } + uint32_t payload_bytes = static_cast((total_bits + 7) / 8); + + size_t need_size = + payload_bytes + mini_blocks_per_block_ + sizeof(min_delta); + + // Grows the buffer to be at least need_size bytes. To avoid frequent + // resizing, the new capacity is calculated as the maximum of (current + // capacity * 1.5) or (current capacity + need_size). + if (result_buffer_->Available() < need_size) { + size_t inc_size = need_size > (result_buffer_->Capacity() * 0.5) + ? need_size + : result_buffer_->Capacity() * 0.5; + result_buffer_->ReSize(result_buffer_->Capacity() + inc_size); + } + + // write block header: min_delta + result_buffer_->Write(reinterpret_cast(&min_delta), + sizeof(min_delta)); + result_buffer_->Brush(sizeof(min_delta)); + + // write bit_widths + result_buffer_->Write(reinterpret_cast(bit_widths), + mini_blocks_per_block_); + result_buffer_->Brush(mini_blocks_per_block_); + + uint8_t *payload_ptr = + reinterpret_cast(result_buffer_->GetAvailableBuffer()); + BitWriter64Ptr bw(payload_ptr); + for (uint32_t i = 0; i < mini_blocks_per_block_; ++i) { + uint32_t start = i * values_per_mini_block_; + if (start >= values_in_block) break; + uint32_t end = std::min(start + values_per_mini_block_, values_in_block); + uint8_t w = bit_widths[i]; + if (w == 0) continue; + for (uint32_t j = start; j < end; ++j) { + uint32_t adjusted = deltas[j] - min_delta; + bw.Append(adjusted, w); + } + } + bw.FlushToByte(); + result_buffer_->Brush(payload_bytes); + + values_emitted += values_in_block; + } +} + +template +bool PaxDeltaEncoder::SupportAppendNull() const { + return false; +} + +template +void PaxDeltaEncoder::Flush() { + // do nothing +} + +// Specialized reading of one mini-block and batch writing results +// (BitReader64Ptr) +template +inline void ReadMiniBlockSpecializedPtr(BitReader64Ptr &br, T *out_values, + T ¤t_value, uint32_t count_in_mb, + uint32_t min_delta, uint8_t w) { + switch (w) { + case 0: { + for (uint32_t j = 0; j < count_in_mb; ++j) { + current_value = + static_cast(static_cast(current_value) + min_delta); + out_values[j] = current_value; + } + return; + } + case 8: { + for (uint32_t j = 0; j < count_in_mb; ++j) { + uint32_t adjusted = br.Read(8); + current_value = static_cast(static_cast(current_value) + + adjusted + min_delta); + out_values[j] = current_value; + } 
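+      // the 16- and 32-bit cases below follow the same pattern; only the read
+      // width changes, the reconstruction stays current += adjusted + min_delta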
+ return; + } + case 16: { + for (uint32_t j = 0; j < count_in_mb; ++j) { + uint32_t adjusted = br.Read(16); + current_value = static_cast(static_cast(current_value) + + adjusted + min_delta); + out_values[j] = current_value; + } + return; + } + case 32: { + for (uint32_t j = 0; j < count_in_mb; ++j) { + uint32_t adjusted = br.Read(32); + current_value = static_cast(static_cast(current_value) + + adjusted + min_delta); + out_values[j] = current_value; + } + return; + } + default: { + uint32_t j = 0; + const uint32_t n4 = count_in_mb & ~3u; + for (; j < n4; j += 4) { + uint32_t a0 = br.Read(w); + uint32_t a1 = br.Read(w); + uint32_t a2 = br.Read(w); + uint32_t a3 = br.Read(w); + current_value = static_cast(static_cast(current_value) + + a0 + min_delta); + out_values[j] = current_value; + current_value = static_cast(static_cast(current_value) + + a1 + min_delta); + out_values[j + 1] = current_value; + current_value = static_cast(static_cast(current_value) + + a2 + min_delta); + out_values[j + 2] = current_value; + current_value = static_cast(static_cast(current_value) + + a3 + min_delta); + out_values[j + 3] = current_value; + } + for (; j < count_in_mb; ++j) { + uint32_t a = br.Read(w); + current_value = static_cast(static_cast(current_value) + + a + min_delta); + out_values[j] = current_value; + } + return; + } + } +} + +// Specialized reading of one mini-block and batch writing results +template +PaxDeltaDecoder::PaxDeltaDecoder( + const PaxDecoder::DecodingOption &encoder_options) + : PaxDecoder(encoder_options), + data_buffer_(nullptr), + result_buffer_(nullptr) { + CBDB_CHECK(encoder_options.column_encode_type == + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA, + cbdb::CException::kExTypeAbort, + fmt("PaxDeltaDecoder only support DIRECT_DELTA encoding")); + // TODO: if sign is true, should use zigzag encoding, now use delta encoding + // for offsets in non-fixed columns + CBDB_CHECK(encoder_options.is_sign == false, + cbdb::CException::kExTypeUnImplements, + fmt("PaxDeltaDecoder is not supported for signed data, " + "will support zigzag later")); +} + +template +PaxDecoder *PaxDeltaDecoder::SetSrcBuffer(char *data, size_t data_len) { + if (data) { + data_buffer_ = + std::make_shared>(data, data_len, false, false); + data_buffer_->Brush(data_len); + } + return this; +} + +template +PaxDecoder *PaxDeltaDecoder::SetDataBuffer( + std::shared_ptr> result_buffer) { + result_buffer_ = result_buffer; + return this; +} + +template +const char *PaxDeltaDecoder::GetBuffer() const { + return result_buffer_ ? result_buffer_->GetBuffer() : nullptr; +} + +template +size_t PaxDeltaDecoder::GetBufferSize() const { + return result_buffer_ ? 
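+                          // bytes decoded into the result buffer so far,
+                          // or 0 when no result buffer has been attached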
result_buffer_->Used() : 0; +} + +template +size_t PaxDeltaDecoder::Next(const char * /*not_null*/) { + CBDB_RAISE(cbdb::CException::kExTypeUnImplements); +} + +template +size_t PaxDeltaDecoder::Decoding() { + if (!data_buffer_) return 0; + Assert(result_buffer_); + + const uint8_t *p = + reinterpret_cast(data_buffer_->GetBuffer()); + uint32_t remaining = static_cast(data_buffer_->Used()); + + // read header: values_per_block, values_per_mini_block_, total_count, + // first_value + DeltaBlockHeader header; + std::memcpy(&header, p, sizeof(header)); + p += sizeof(header); + remaining -= sizeof(header); + uint32_t values_per_block = header.value_per_block; + uint32_t values_per_mini_block_ = header.values_per_mini_block; + uint32_t total_count = header.total_count; + + T first_value; + std::memcpy(&first_value, p, sizeof(T)); + p += sizeof(T); + remaining -= sizeof(T); + + // reserve output buffer + if (result_buffer_->Capacity() < total_count * sizeof(T)) { + result_buffer_->ReSize(total_count * sizeof(T)); + } + + // write first value + T current_value = static_cast(first_value); + result_buffer_->Write(reinterpret_cast(¤t_value), sizeof(T)); + result_buffer_->Brush(sizeof(T)); + uint32_t decoded = 1; + + const uint32_t mini_blocks_per_block_ = + values_per_block / values_per_mini_block_; + + while (decoded < total_count && remaining > 0) { + uint32_t min_delta; + std::memcpy(&min_delta, p, sizeof(min_delta)); + p += sizeof(min_delta); + remaining -= sizeof(min_delta); + + if (remaining < mini_blocks_per_block_) break; + + uint8_t bit_widths[mini_blocks_per_block_] = {0}; + for (uint32_t i = 0; i < mini_blocks_per_block_; ++i) { + bit_widths[i] = *p++; + --remaining; + } + + uint32_t values_in_block = + std::min(values_per_block, total_count - decoded); + + // read payload: initialize reader with remaining bytes; we'll compute + // consumed + BitReader64Ptr br(p, remaining); + + for (uint32_t i = 0; i < mini_blocks_per_block_ && decoded < total_count; + ++i) { + uint32_t start = i * values_per_mini_block_; + if (start >= values_in_block) break; + uint32_t end = std::min(start + values_per_mini_block_, values_in_block); + uint32_t cnt = end - start; + uint8_t w = bit_widths[i]; + + T *out_base = reinterpret_cast(result_buffer_->GetAvailableBuffer()); + ReadMiniBlockSpecializedPtr(br, out_base, current_value, cnt, + min_delta, w); + result_buffer_->Brush(cnt * sizeof(T)); + decoded += cnt; + } + + br.AlignToByte(); + + size_t consumed = br.index; + p += consumed; + remaining -= consumed; + } + + Assert(result_buffer_->Used() == total_count * sizeof(T)); + + return result_buffer_->Used(); +} + +template +size_t PaxDeltaDecoder::Decoding(const char * /*not_null*/, + size_t /*not_null_len*/) { + CBDB_RAISE(cbdb::CException::kExTypeUnImplements); +} + +template class PaxDeltaEncoder; +template class PaxDeltaDecoder; +// Add explicit instantiations for signed integral types used by CreateDecoder +template class PaxDeltaDecoder; +template class PaxDeltaDecoder; +template class PaxDeltaDecoder; +template class PaxDeltaDecoder; + +} // namespace pax \ No newline at end of file diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.h b/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.h new file mode 100644 index 00000000000..7f2251201bf --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.h @@ -0,0 +1,135 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * pax_delta_encoding.h + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding.h + * + *------------------------------------------------------------------------- + */ +#pragma once + +#include "storage/columns/pax_encoding.h" +#include "storage/columns/pax_decoding.h" +#include + +namespace pax { + +struct BitReader64 { + const uint8_t*& p; + uint32_t& remaining; + uint64_t bit_buffer = 0; + uint32_t bit_count = 0; + + BitReader64(const uint8_t*& ptr, uint32_t& size) : p(ptr), remaining(size) {} + + inline void Ensure(uint32_t need_bits) { + while (bit_count < need_bits && remaining > 0) { + bit_buffer |= (static_cast(*p) << bit_count); + ++p; + --remaining; + bit_count += 8; + } + } + + inline uint32_t Read(uint8_t width) { + if (width == 0) return 0; + Ensure(width); + uint32_t result; + if (width == 32) { + result = static_cast(bit_buffer & 0xFFFFFFFFull); + } else { + result = static_cast(bit_buffer & ((1ull << width) - 1)); + } + bit_buffer >>= width; + bit_count -= width; + return result; + } + + inline void AlignToByte() { + uint32_t drop = bit_count % 8; + if (drop) { + bit_buffer >>= drop; + bit_count -= drop; + } + } +}; + +struct DeltaBlockHeader { + uint32_t value_per_block; + uint32_t values_per_mini_block; + uint32_t total_count; +}; + +template +class PaxDeltaEncoder : public PaxEncoder { + public: + explicit PaxDeltaEncoder(const EncodingOption &encoder_options); + + virtual void Append(char *data, size_t size) override; + + virtual bool SupportAppendNull() const override; + + virtual void Flush() override; + + virtual size_t GetBoundSize(size_t src_len) const override; + + private: + + void Encode(T *data, size_t size); + + private: + static constexpr uint32_t value_per_block_ = 128; + static constexpr uint32_t mini_blocks_per_block_ = 4; + static constexpr uint32_t values_per_mini_block_ = + value_per_block_ / mini_blocks_per_block_; + + private: + bool has_append_ = false; + // Reusable working buffer to avoid per-block allocations during encoding + std::vector deltas_scratch_; +}; + +template +class PaxDeltaDecoder : public PaxDecoder { + public: + explicit PaxDeltaDecoder(const PaxDecoder::DecodingOption &encoder_options); + + virtual PaxDecoder *SetSrcBuffer(char *data, size_t data_len) override; + + virtual PaxDecoder *SetDataBuffer( + std::shared_ptr> result_buffer) override; + + virtual size_t Next(const char *not_null) override; + + virtual size_t Decoding() override; + + virtual size_t Decoding(const char *not_null, size_t not_null_len) override; + + virtual const char *GetBuffer() const override; + + virtual size_t GetBufferSize() const override; + + private: + std::shared_ptr> data_buffer_; + std::shared_ptr> result_buffer_; +}; + +} // namespace pax \ No newline at end of 
file diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding_test.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding_test.cc new file mode 100644 index 00000000000..031563381ee --- /dev/null +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding_test.cc @@ -0,0 +1,339 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * pax_delta_encoding_test.cc + * + * IDENTIFICATION + * contrib/pax_storage/src/cpp/storage/columns/pax_delta_encoding_test.cc + * + *------------------------------------------------------------------------- + */ + +#include "storage/columns/pax_delta_encoding.h" + +#include +#include + +#include "comm/gtest_wrappers.h" +#include "pax_gtest_helper.h" + +namespace pax { + +class PaxDeltaEncodingTest : public ::testing::Test { + protected: + void SetUp() override { + // Create encoding options + encoding_options_.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + encoding_options_.is_sign = false; + + // Create decoding options + decoding_options_.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + decoding_options_.is_sign = false; + } + + void TearDown() override {} + + // Fast bit width calculation (0 -> 0) + inline uint8_t FastNumBits(uint32_t v) { +#if defined(__GNUC__) || defined(__clang__) + return v == 0 ? 
0 : static_cast(32 - __builtin_clz(v)); +#else + uint8_t bits = 0; + while (v) { + ++bits; + v >>= 1; + } + return bits; +#endif + } + + // Helper function to encode and decode data + template + std::vector EncodeAndDecode(const std::vector &input) { + // Create encoder + PaxDeltaEncoder encoder(encoding_options_); + + size_t bound_size = encoder.GetBoundSize(input.size() * sizeof(T)); + + encoder.SetDataBuffer(std::make_shared>(bound_size)); + + // Encode data + encoder.Append(reinterpret_cast(const_cast(input.data())), + input.size() * sizeof(T)); + + // Get encoded buffer + const char *encoded_data = encoder.GetBuffer(); + size_t encoded_size = encoder.GetBufferSize(); + + // Create decoder + PaxDeltaDecoder decoder(decoding_options_); + + // Set source buffer + decoder.SetSrcBuffer(const_cast(encoded_data), encoded_size); + + // Create result buffer + auto result_buffer = + std::make_shared>(input.size() * sizeof(T)); + decoder.SetDataBuffer(result_buffer); + + // Decode + size_t decoded_size = decoder.Decoding(); + + // Convert result back to vector + const T *decoded_data = reinterpret_cast(decoder.GetBuffer()); + size_t count = decoded_size / sizeof(T); + + return std::vector(decoded_data, decoded_data + count); + } + + PaxEncoder::EncodingOption encoding_options_; + PaxDecoder::DecodingOption decoding_options_; +}; + +// Test basic functionality +TEST_F(PaxDeltaEncodingTest, BasicEncodeDecode) { + std::vector input = {1, 2, 3, 4, 5}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test example from documentation - consecutive sequence +TEST_F(PaxDeltaEncodingTest, ConsecutiveSequence) { + std::vector input = {1, 2, 3, 4, 5}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); + + // Verify deltas would be [1, 1, 1, 1] with min_delta = 1 + // and adjusted deltas [0, 0, 0, 0] with bit_width = 0 +} + +// Test example from documentation - sequence with variation +TEST_F(PaxDeltaEncodingTest, SequenceWithVariation) { + std::vector input = {7, 5, 3, 1, 2, 3, 4, 5}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); + + // Verify deltas would be [-2, -2, -2, 1, 1, 1, 1] with min_delta = -2 + // Since we cast to uint32, -2 becomes a large positive number + // adjusted deltas would be [0, 0, 0, 3, 3, 3, 3] with bit_width = 2 +} + +// Test single value +TEST_F(PaxDeltaEncodingTest, SingleValue) { + std::vector input = {42}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test two values +TEST_F(PaxDeltaEncodingTest, TwoValues) { + std::vector input = {10, 15}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test large values +TEST_F(PaxDeltaEncodingTest, LargeValues) { + std::vector input = {1000000, 1000001, 1000002, 1000003}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test values with large deltas +TEST_F(PaxDeltaEncodingTest, LargeDeltas) { + std::vector input = {1, 1000, 2000, 3000}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test full block (128 values) +TEST_F(PaxDeltaEncodingTest, FullBlock) { + std::vector input; + for (uint32_t i = 0; i < 128; ++i) { + input.push_back(i); + } + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test multiple blocks +TEST_F(PaxDeltaEncodingTest, MultipleBlocks) { + std::vector input; + for (uint32_t i = 0; i < 250; ++i) { + input.push_back(i); + } + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test random data 
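+// (values are drawn independently, so consecutive deltas can be negative;
+// they wrap through uint32 arithmetic on encode and wrap back on decode,
+// so the round trip is still exact)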
+TEST_F(PaxDeltaEncodingTest, RandomData) { + std::mt19937 gen(12345); + std::uniform_int_distribution dis(0, 1000000); + + std::vector input; + for (int i = 0; i < 100; ++i) { + input.push_back(dis(gen)); + } + + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test payload size calculation +TEST_F(PaxDeltaEncodingTest, PayloadSizeCalculation) { + std::vector input = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 56, 63, 89}; + // Test the specific example: deltas [0,0,0,0,0,0,0,0,...,0,22,6,25] with + // bit_width 0,5,0,0 + + PaxDeltaEncoder encoder(encoding_options_); + size_t bound_size = encoder.GetBoundSize(input.size() * sizeof(uint32_t)); + encoder.SetDataBuffer(std::make_shared>(bound_size)); + encoder.Append(reinterpret_cast(input.data()), + input.size() * sizeof(uint32_t)); + + // Verify the encoded data structure manually + const char *encoded_data = encoder.GetBuffer(); + size_t encoded_size = encoder.GetBufferSize(); + + EXPECT_GT(encoded_size, 0); + + // Parse the encoded data + const uint8_t *p = reinterpret_cast(encoded_data); + + // Read header + DeltaBlockHeader header; + std::memcpy(&header, p, sizeof(header)); + p += sizeof(header); + + EXPECT_EQ(header.value_per_block, 128); + EXPECT_EQ(header.values_per_mini_block, 32); + EXPECT_EQ(header.total_count, input.size()); + + // Read first value + uint32_t first_value; + std::memcpy(&first_value, p, sizeof(first_value)); + p += sizeof(first_value); + EXPECT_EQ(first_value, 1); + + // Read block data + uint32_t min_delta; + std::memcpy(&min_delta, p, sizeof(min_delta)); + p += sizeof(min_delta); + + // Read allbit widths + uint8_t bit_widths[4]; + for (int i = 0; i < 4; ++i) { + bit_widths[i] = *p++; + } + + // bit_widths should be [0, 6, 0, 0] + ASSERT_EQ(bit_widths[0], 0); + ASSERT_EQ(bit_widths[1], 5); + ASSERT_EQ(bit_widths[2], 0); + ASSERT_EQ(bit_widths[3], 0); + + // Compute payload size from bit_widths and counts + uint32_t values_in_block = + input.size() - 1; // we constructed input with 35 deltas in first block + uint64_t total_bits = 0; + for (uint32_t i = 0; i < 4; ++i) { + uint32_t start = i * 32; + if (start >= values_in_block) break; + uint32_t end = std::min(start + 32u, values_in_block); + uint8_t w = bit_widths[i]; + total_bits += static_cast(w) * (end - start); + } + uint32_t payload_size = static_cast((total_bits + 7) / 8); + + // For this example, we expect payload_size = 2 bytes + EXPECT_EQ(payload_size, 2); + + // Assert payload bitmap is correct + uint8_t payload[4]; + std::memcpy(payload, p, 4); + p += 4; + + // payload should be LSB-Last, value is(22,6,25) + // [0b10110, 0b00110, 0b11001] + EXPECT_EQ(payload[0], 0b11010110); + EXPECT_EQ(payload[1], 0b01100100); +} + +// Test bit width calculation helper +TEST_F(PaxDeltaEncodingTest, BitWidthCalculation) { + EXPECT_EQ(FastNumBits(0), 0); + EXPECT_EQ(FastNumBits(1), 1); + EXPECT_EQ(FastNumBits(2), 2); + EXPECT_EQ(FastNumBits(3), 2); + EXPECT_EQ(FastNumBits(4), 3); + EXPECT_EQ(FastNumBits(7), 3); + EXPECT_EQ(FastNumBits(8), 4); + EXPECT_EQ(FastNumBits(15), 4); + EXPECT_EQ(FastNumBits(16), 5); + EXPECT_EQ(FastNumBits(255), 8); + EXPECT_EQ(FastNumBits(256), 9); +} + +// Test zero deltas (all same values) +TEST_F(PaxDeltaEncodingTest, ZeroDeltas) { + std::vector input = {42, 42, 42, 42, 42}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test decreasing sequence (negative deltas) +TEST_F(PaxDeltaEncodingTest, 
DecreasingSequence) { + std::vector input = {100, 90, 80, 70, 60}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test mixed pattern +TEST_F(PaxDeltaEncodingTest, MixedPattern) { + std::vector input = {10, 20, 15, 25, 5, 30, 1, 35}; + auto output = EncodeAndDecode(input); + EXPECT_EQ(input, output); +} + +// Test empty input (edge case) +TEST_F(PaxDeltaEncodingTest, EmptyInput) { + std::vector input = {}; + // This should handle gracefully or throw expected exception + // For now, let's skip this test until we clarify expected behavior +} + +// Test different data types +TEST_F(PaxDeltaEncodingTest, DifferentTypes) { + // Test int32_t (with non-negative values) + std::vector input32 = {1, 2, 3, 4, 5}; + auto output32 = EncodeAndDecode(input32); + EXPECT_EQ(input32, output32); +} + +} // namespace pax + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_dict_encoding.h b/contrib/pax_storage/src/cpp/storage/columns/pax_dict_encoding.h index e552fa7a55a..38f3ba217db 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_dict_encoding.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_dict_encoding.h @@ -53,6 +53,10 @@ class PaxDictEncoder final : public PaxEncoder { void Flush() override; + size_t GetBoundSize(size_t src_len) const override { + CBDB_RAISE(cbdb::CException::kExTypeUnImplements); + } + private: size_t AppendInternal(char *data, size_t len); @@ -89,7 +93,8 @@ class PaxDictDecoder final : public PaxDecoder { PaxDecoder *SetSrcBuffer(char *data, size_t data_len) override; - PaxDecoder *SetDataBuffer(std::shared_ptr> result_buffer) override; + PaxDecoder *SetDataBuffer( + std::shared_ptr> result_buffer) override; const char *GetBuffer() const override; @@ -121,8 +126,8 @@ class PaxDictDecoder final : public PaxDecoder { buffer = src_buff->GetBuffer(); - index_buffer = - std::make_shared>((int32 *)buffer, head.indexsz, false, false); + index_buffer = std::make_shared>( + (int32 *)buffer, head.indexsz, false, false); index_buffer->BrushAll(); desc_buffer = std::make_shared>( @@ -130,8 +135,8 @@ class PaxDictDecoder final : public PaxDecoder { false); desc_buffer->BrushAll(); - entry_buffer = std::make_shared>(buffer + head.indexsz, head.dictsz, - false, false); + entry_buffer = std::make_shared>( + buffer + head.indexsz, head.dictsz, false, false); entry_buffer->BrushAll(); return std::make_tuple(index_buffer, entry_buffer, desc_buffer); diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.cc index 3a354ceec8d..b11b2b7b6bd 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.cc @@ -33,6 +33,7 @@ #include "comm/pax_memory.h" #include "storage/columns/pax_dict_encoding.h" #include "storage/columns/pax_rlev2_encoding.h" +#include "storage/columns/pax_delta_encoding.h" namespace pax { @@ -56,8 +57,7 @@ std::shared_ptr PaxEncoder::CreateStreamingEncoder( break; } case ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA: { - // TODO(jiaqizho): support direct delta encoding - // not support yet, then direct return a nullptr(means no encoding) + encoder = std::make_shared>(encoder_options); break; } case ColumnEncoding_Kind::ColumnEncoding_Kind_DEF_ENCODED: { diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.h 
b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.h index 362e68caa13..465c7bf0600 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding.h @@ -75,6 +75,8 @@ class PaxEncoder { virtual size_t GetBufferSize() const; + virtual size_t GetBoundSize(size_t src_len) const = 0; + /** * steaming encoder * diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.cc index b59a5b879c1..0f93467812b 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.cc @@ -124,7 +124,7 @@ void PaxEncodingColumn::InitDecoder() { } template -void PaxEncodingColumn::Set(std::shared_ptr> data) { +void PaxEncodingColumn::Set(std::unique_ptr> data) { if (decoder_) { // should not decoding null if (data->Used() != 0) { @@ -155,7 +155,7 @@ void PaxEncodingColumn::Set(std::shared_ptr> data) { Assert(!data->IsMemTakeOver()); } else { - PaxCommColumn::Set(data); + PaxCommColumn::Set(std::move(data)); } } @@ -175,7 +175,7 @@ std::pair PaxEncodingColumn::GetBuffer() { if (encoder_) { // changed streaming encode to blocking encode // because we still need store a origin data in `PaxCommColumn` - auto origin_data_buffer = PaxCommColumn::data_; + auto origin_data_buffer = PaxCommColumn::data_.get(); shared_data_ = std::make_shared>(origin_data_buffer->Used()); encoder_->SetDataBuffer(shared_data_); diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.h b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.h index 773a7848f2f..5a5f1c378ba 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_column.h @@ -44,7 +44,7 @@ class PaxEncodingColumn : public PaxCommColumn { ~PaxEncodingColumn() override; - void Set(std::shared_ptr> data) override; + void Set(std::unique_ptr> data) override; std::pair GetBuffer() override; diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.cc index 25b6d2f1d6d..359a1c483a9 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.cc @@ -59,21 +59,37 @@ void PaxNonFixedEncodingColumn::InitEncoder() { } void PaxNonFixedEncodingColumn::InitOffsetStreamCompressor() { - Assert(encoder_options_.offsets_encode_type != - ColumnEncoding_Kind::ColumnEncoding_Kind_DEF_ENCODED); - offsets_compressor_ = PaxCompressor::CreateBlockCompressor( - encoder_options_.offsets_encode_type); + Assert(encoder_options_.offsets_encode_type == + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA); + SetOffsetsEncodeType(encoder_options_.offsets_encode_type); SetOffsetsCompressLevel(encoder_options_.offsets_compress_level); + + PaxEncoder::EncodingOption opt = encoder_options_; + opt.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + opt.is_sign = false; + // offsets are fixed-width, do not enable non_fixed streaming restriction + offsets_encoder_ = PaxEncoder::CreateStreamingEncoder(opt, false); } void PaxNonFixedEncodingColumn::InitOffsetStreamDecompressor() { Assert(decoder_options_.offsets_encode_type != ColumnEncoding_Kind::ColumnEncoding_Kind_DEF_ENCODED); - offsets_compressor_ = 
PaxCompressor::CreateBlockCompressor( - decoder_options_.offsets_encode_type); SetOffsetsEncodeType(decoder_options_.offsets_encode_type); SetOffsetsCompressLevel(decoder_options_.offsets_compress_level); + + if (decoder_options_.offsets_encode_type == + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA) { + PaxDecoder::DecodingOption temp_opt = decoder_options_; + temp_opt.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + temp_opt.is_sign = false; + offsets_decoder_ = PaxDecoder::CreateDecoder(temp_opt); + } else { + offsets_compressor_ = PaxCompressor::CreateBlockCompressor( + decoder_options_.offsets_encode_type); + } } void PaxNonFixedEncodingColumn::InitDecoder() { @@ -129,8 +145,8 @@ PaxNonFixedEncodingColumn::PaxNonFixedEncodingColumn( PaxNonFixedEncodingColumn::~PaxNonFixedEncodingColumn() {} -void PaxNonFixedEncodingColumn::Set(std::shared_ptr> data, - std::shared_ptr> offsets, +void PaxNonFixedEncodingColumn::Set(std::unique_ptr> data, + std::unique_ptr> offsets, size_t total_size) { bool exist_decoder; Assert(data && offsets); @@ -163,15 +179,19 @@ void PaxNonFixedEncodingColumn::Set(std::shared_ptr> data, // `data_` have the same buffer with `shared_data_` PaxNonFixedColumn::data_->Brush(shared_data_->Used()); // no delete the origin data - shared_data_ = data; + shared_data_ = std::move(data); } }; auto offsets_decompress = [&]() { Assert(!compress_route_); - Assert(offsets_compressor_); + Assert(offsets_compressor_ || offsets_decoder_); + + if (offsets->Used() == 0) { + return; + } - if (offsets->Used() != 0) { + if (offsets_compressor_) { auto d_size = offsets_compressor_->Decompress( PaxNonFixedColumn::offsets_->Start(), PaxNonFixedColumn::offsets_->Capacity(), offsets->Start(), @@ -182,28 +202,42 @@ void PaxNonFixedEncodingColumn::Set(std::shared_ptr> data, fmt("Decompress failed, %s", compressor_->ErrorName(d_size))); } PaxNonFixedColumn::offsets_->Brush(d_size); + return; + } + + if (offsets_decoder_) { + // Decode offsets using encoder for int32 stream + shared_offsets_data_ = std::make_shared>( + PaxNonFixedColumn::offsets_->Start(), + PaxNonFixedColumn::offsets_->Capacity(), false, false); + offsets_decoder_->SetDataBuffer(shared_offsets_data_); + offsets_decoder_->SetSrcBuffer(offsets->Start(), offsets->Used()); + offsets_decoder_->Decoding(); + PaxNonFixedColumn::offsets_->Brush(shared_offsets_data_->Used()); + return; } }; exist_decoder = compressor_ || decoder_; + bool has_offsets_processor = offsets_compressor_ || offsets_decoder_; - if (exist_decoder && offsets_compressor_) { + if (exist_decoder && has_offsets_processor) { data_decompress(); offsets_decompress(); PaxNonFixedColumn::estimated_size_ = total_size; PaxNonFixedColumn::next_offsets_ = -1; - } else if (exist_decoder && !offsets_compressor_) { + } else if (exist_decoder && !has_offsets_processor) { data_decompress(); - PaxNonFixedColumn::offsets_ = offsets; + PaxNonFixedColumn::offsets_ = std::move(offsets); PaxNonFixedColumn::estimated_size_ = total_size; PaxNonFixedColumn::next_offsets_ = -1; - } else if (!exist_decoder && offsets_compressor_) { - PaxNonFixedColumn::data_ = data; + } else if (!exist_decoder && has_offsets_processor) { + PaxNonFixedColumn::data_ = std::move(data); offsets_decompress(); PaxNonFixedColumn::estimated_size_ = total_size; PaxNonFixedColumn::next_offsets_ = -1; } else { // (!compressor_ && !offsets_compressor_) - PaxNonFixedColumn::Set(data, offsets, total_size); + PaxNonFixedColumn::Set(std::move(data), std::move(offsets), total_size); } } 
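// A minimal, standalone sketch of the size accounting that GetBoundSize()
// (used below to size the destination buffer for the delta-encoded offsets
// stream) and the payload computation in PaxDeltaEncoder::Encode() perform,
// assuming the block geometry used by the patch (128 values per block, 4
// mini-blocks of 32). The helper name is illustrative only and does not
// exist in the patch.
#include <algorithm>
#include <cstdint>
#include <vector>

static uint32_t DeltaBlockPayloadBytes(const std::vector<uint32_t> &deltas) {
  constexpr uint32_t kValuesPerMiniBlock = 32;
  constexpr uint32_t kMiniBlocksPerBlock = 4;
  if (deltas.empty()) return 0;
  const uint32_t n = static_cast<uint32_t>(deltas.size());  // one block: n <= 128
  const uint32_t min_delta = *std::min_element(deltas.begin(), deltas.end());
  uint64_t total_bits = 0;
  for (uint32_t mb = 0; mb < kMiniBlocksPerBlock; ++mb) {
    const uint32_t start = mb * kValuesPerMiniBlock;
    if (start >= n) break;
    const uint32_t end = std::min(start + kValuesPerMiniBlock, n);
    uint32_t adjusted_max = 0;
    for (uint32_t i = start; i < end; ++i)
      adjusted_max = std::max(adjusted_max, deltas[i] - min_delta);
    uint8_t width = 0;  // number of bits needed to hold adjusted_max
    while (adjusted_max) { ++width; adjusted_max >>= 1; }
    total_bits += static_cast<uint64_t>(width) * (end - start);
  }
  return static_cast<uint32_t>((total_bits + 7) / 8);  // payload is byte-aligned
}
// With the PayloadSizeCalculation test data (32 deltas of 1 followed by
// 23, 7, 26) this yields min_delta = 1, widths {0, 5, 0, 0} and 2 payload
// bytes, matching the expectations asserted in that test.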
@@ -278,17 +312,17 @@ std::pair PaxNonFixedEncodingColumn::GetOffsetBuffer( AppendLastOffset(); } - if (offsets_compressor_ && compress_route_) { - if (shared_offsets_data_) { - return std::make_pair(shared_offsets_data_->Start(), - shared_offsets_data_->Used()); - } + if (shared_offsets_data_) { + return std::make_pair(shared_offsets_data_->Start(), + shared_offsets_data_->Used()); + } - if (PaxNonFixedColumn::offsets_->Used() == 0) { - // should never append last offset again - return PaxNonFixedColumn::GetOffsetBuffer(false); - } + if (PaxNonFixedColumn::offsets_->Used() == 0) { + // should never append last offset again + return PaxNonFixedColumn::GetOffsetBuffer(false); + } + if (offsets_compressor_ && compress_route_) { size_t bound_size = offsets_compressor_->GetCompressBound( PaxNonFixedColumn::offsets_->Used()); shared_offsets_data_ = std::make_shared>(bound_size); @@ -308,6 +342,20 @@ std::pair PaxNonFixedEncodingColumn::GetOffsetBuffer( shared_offsets_data_->Used()); } + if (offsets_encoder_ && compress_route_) { + // For delta encoder, allocate a buffer sized by raw bytes for safety + size_t bound_size = offsets_encoder_->GetBoundSize(offsets_->Used()); + shared_offsets_data_ = std::make_shared>(bound_size); + offsets_encoder_->SetDataBuffer(shared_offsets_data_); + + // Encode entire offsets buffer as a single stream + offsets_encoder_->Append(offsets_->Start(), offsets_->Used()); + offsets_encoder_->Flush(); + + return std::make_pair(shared_offsets_data_->Start(), + shared_offsets_data_->Used()); + } + // no compress or uncompressed // should never append last offset again return PaxNonFixedColumn::GetOffsetBuffer(false); diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.h b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.h index b4e956cfe4a..f5f5fd82128 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_non_fixed_column.h @@ -42,8 +42,8 @@ class PaxNonFixedEncodingColumn : public PaxNonFixedColumn { ~PaxNonFixedEncodingColumn() override; - void Set(std::shared_ptr> data, - std::shared_ptr> offsets, + void Set(std::unique_ptr> data, + std::unique_ptr> offsets, size_t total_size) override; std::pair GetBuffer() override; @@ -83,6 +83,9 @@ class PaxNonFixedEncodingColumn : public PaxNonFixedColumn { std::shared_ptr> shared_data_; std::shared_ptr offsets_compressor_; + // Optional encoder/decoder for offsets stream (alternative to compression) + std::shared_ptr offsets_encoder_; + std::shared_ptr offsets_decoder_; std::shared_ptr> shared_offsets_data_; }; diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_test.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_test.cc index 5fa7fb7153c..b3a7ec59458 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_test.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_encoding_test.cc @@ -1361,4 +1361,96 @@ TEST_F(PaxEncodingTest, TestEncodingWithAllNULL) { ASSERT_EQ(n_read, shared_dst_data->Used()); } +TEST_F(PaxEncodingTest, TestPaxDeltaEncodingBasic) { + std::vector data_vec{100, 101, 102, 105, 106, 110, 120, 121}; + auto shared_data = std::make_shared>(1024); + auto shared_dst_data = std::make_shared>(1024); + + PaxEncoder::EncodingOption encoder_options; + encoder_options.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + encoder_options.is_sign = false; + auto encoder = 
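+      // For DIRECT_DELTA, CreateStreamingEncoder now returns a
+      // PaxDeltaEncoder instead of nullptr ("no encoding"), so the
+      // ASSERT_TRUE below exercises the new code path.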
PaxEncoder::CreateStreamingEncoder(encoder_options); + + ASSERT_TRUE(encoder); + encoder->SetDataBuffer(shared_data); + encoder->Append(reinterpret_cast(data_vec.data()), data_vec.size() * sizeof(uint32_t)); + encoder->Flush(); + + ASSERT_NE(encoder->GetBuffer(), nullptr); + ASSERT_GT(encoder->GetBufferSize(), 0UL); + + PaxDecoder::DecodingOption decoder_options; + decoder_options.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + decoder_options.is_sign = false; + + auto decoder = PaxDecoder::CreateDecoder(decoder_options); + ASSERT_TRUE(decoder); + decoder->SetSrcBuffer(shared_data->GetBuffer(), shared_data->Used()); + + decoder->SetDataBuffer(shared_dst_data); + decoder->Decoding(); + + ASSERT_EQ(shared_dst_data->Used(), data_vec.size() * sizeof(int32)); + + auto result_dst_data = std::make_shared>( + reinterpret_cast(shared_dst_data->Start()), + shared_dst_data->Used(), false, false); + + for (size_t i = 0; i < data_vec.size(); ++i) { + ASSERT_EQ((*result_dst_data)[i], static_cast(data_vec[i])); + } +} + +TEST_F(PaxEncodingTest, TestPaxDeltaEncodingRoundTripRandom) { + const size_t n = 1000; + std::vector data_vec(n); + std::mt19937 rng(12345); + std::uniform_int_distribution base_dist(0, 100); + std::uniform_int_distribution step_dist(0, 5); + + data_vec[0] = base_dist(rng); + for (size_t i = 1; i < n; ++i) { + data_vec[i] = data_vec[i - 1] + step_dist(rng); + } + + auto shared_data = std::make_shared>(n * sizeof(uint32_t)); + auto shared_dst_data = std::make_shared>(n * sizeof(uint32_t)); + + PaxEncoder::EncodingOption encoder_options; + encoder_options.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + encoder_options.is_sign = false; + auto encoder = PaxEncoder::CreateStreamingEncoder(encoder_options); + + ASSERT_TRUE(encoder); + encoder->SetDataBuffer(shared_data); + + encoder->Append(reinterpret_cast(data_vec.data()), data_vec.size() * sizeof(uint32_t)); + encoder->Flush(); + + PaxDecoder::DecodingOption decoder_options; + decoder_options.column_encode_type = + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA; + decoder_options.is_sign = false; + + auto decoder = PaxDecoder::CreateDecoder(decoder_options); + ASSERT_TRUE(decoder); + decoder->SetSrcBuffer(shared_data->GetBuffer(), shared_data->Used()); + + decoder->SetDataBuffer(shared_dst_data); + decoder->Decoding(); + + ASSERT_EQ(shared_dst_data->Used(), data_vec.size() * sizeof(int32)); + + auto result_dst_data = std::make_shared>( + reinterpret_cast(shared_dst_data->Start()), + shared_dst_data->Used(), false, false); + + for (size_t i = 0; i < data_vec.size(); ++i) { + ASSERT_EQ((*result_dst_data)[i], static_cast(data_vec[i])); + } +} + } // namespace pax::tests diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_rlev2_encoding.h b/contrib/pax_storage/src/cpp/storage/columns/pax_rlev2_encoding.h index 7d021a1f1cf..f2197258b69 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_rlev2_encoding.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_rlev2_encoding.h @@ -49,6 +49,10 @@ class PaxOrcEncoder final : public PaxEncoder { void Flush() override; + size_t GetBoundSize(size_t src_len) const override { + CBDB_RAISE(cbdb::CException::kExTypeUnImplements); + } + private: struct EncoderContext { bool is_sign; diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.cc b/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.cc index aaf514f5926..8f3aafae2c4 100644 --- 
a/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.cc +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.cc @@ -348,7 +348,7 @@ void PaxVecNonFixedEncodingColumn::Set( PaxVecNonFixedColumn::estimated_size_ = total_size; PaxVecNonFixedColumn::next_offsets_ = -1; } else { // (!compressor_ && !offsets_compressor_) - PaxVecNonFixedColumn::Set(data, offsets_, total_size, non_null_rows); + PaxVecNonFixedColumn::Set(data, offsets, total_size, non_null_rows); } } diff --git a/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.h b/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.h index 4362312a5a9..524ddca261a 100644 --- a/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.h +++ b/contrib/pax_storage/src/cpp/storage/columns/pax_vec_encoding_column.h @@ -112,6 +112,9 @@ class PaxVecNonFixedEncodingColumn : public PaxVecNonFixedColumn { std::shared_ptr> shared_data_; std::shared_ptr offsets_compressor_; + // Optional encoder/decoder for offsets stream (alternative to compression) + std::shared_ptr offsets_encoder_; + std::shared_ptr offsets_decoder_; std::shared_ptr> shared_offsets_data_; }; diff --git a/contrib/pax_storage/src/cpp/storage/file_system.cc b/contrib/pax_storage/src/cpp/storage/file_system.cc index fa0667241e8..2ac4803ba8c 100644 --- a/contrib/pax_storage/src/cpp/storage/file_system.cc +++ b/contrib/pax_storage/src/cpp/storage/file_system.cc @@ -65,4 +65,14 @@ void File::PWriteN(const void *buf, size_t count, off64_t offset) { "errno=%d], %s", offset, count, num, errno, DebugString().c_str())); } + +void File::ReadBatch(const std::vector &requests) const { + if (requests.empty()) { + return; + } + for (const auto &req : requests) { + PReadN(req.buffer, req.size, req.offset); + } +} + } // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/file_system.h b/contrib/pax_storage/src/cpp/storage/file_system.h index ca1af8877cc..6569ee3b858 100644 --- a/contrib/pax_storage/src/cpp/storage/file_system.h +++ b/contrib/pax_storage/src/cpp/storage/file_system.h @@ -33,6 +33,7 @@ #include #include +#include "comm/common_io.h" #include "comm/pax_memory.h" namespace pax { @@ -74,6 +75,7 @@ class File { virtual void WriteN(const void *ptr, size_t n); virtual void PWriteN(const void *buf, size_t count, off_t offset); virtual void PReadN(void *buf, size_t count, off_t offset) const; + virtual void ReadBatch(const std::vector &requests) const; virtual void Flush() = 0; virtual void Delete() = 0; diff --git a/contrib/pax_storage/src/cpp/storage/filter/pax_filter.cc b/contrib/pax_storage/src/cpp/storage/filter/pax_filter.cc index d7f752a33ec..5f19ab58400 100644 --- a/contrib/pax_storage/src/cpp/storage/filter/pax_filter.cc +++ b/contrib/pax_storage/src/cpp/storage/filter/pax_filter.cc @@ -44,11 +44,12 @@ namespace pax { PaxFilter::PaxFilter() : sparse_filter_(nullptr), row_filter_(nullptr) {} void PaxFilter::InitSparseFilter(Relation relation, List *quals, + ScanKey key, int nkeys, bool allow_fallback_to_pg) { Assert(!sparse_filter_); sparse_filter_ = std::make_shared(relation, allow_fallback_to_pg); - sparse_filter_->Initialize(quals); + sparse_filter_->Initialize(quals, key, nkeys); } #ifdef VEC_BUILD diff --git a/contrib/pax_storage/src/cpp/storage/filter/pax_filter.h b/contrib/pax_storage/src/cpp/storage/filter/pax_filter.h index 467b841ec89..ebc2fff8538 100644 --- a/contrib/pax_storage/src/cpp/storage/filter/pax_filter.h +++ b/contrib/pax_storage/src/cpp/storage/filter/pax_filter.h @@ -50,7 
+50,7 @@ class PaxFilter final { ~PaxFilter() = default; // The sparse filter - void InitSparseFilter(Relation relation, List *quals, + void InitSparseFilter(Relation relation, List *quals, ScanKey key, int nkeys, bool allow_fallback_to_pg = false); #ifdef VEC_BUILD void InitSparseFilter( diff --git a/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_filter.h b/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_filter.h index 504878c4dd2..6efa59a7ff6 100644 --- a/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_filter.h +++ b/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_filter.h @@ -65,7 +65,7 @@ class PaxSparseFilter final { bool ExistsFilterPath() const; - void Initialize(List *quals); + void Initialize(List *quals, ScanKey key, int nkeys); #ifdef VEC_BUILD void Initialize( @@ -83,6 +83,8 @@ class PaxSparseFilter final { private: #endif + std::shared_ptr ProcessScanKey(ScanKey key); + // Used to build the filter tree with the PG quals std::shared_ptr ExprWalker(Expr *expr); Expr *ExprFlatVar(Expr *expr); diff --git a/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_pg_path.cc b/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_pg_path.cc index 0630db6dc21..3a7bc64f389 100644 --- a/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_pg_path.cc +++ b/contrib/pax_storage/src/cpp/storage/filter/pax_sparse_pg_path.cc @@ -36,7 +36,7 @@ namespace pax { -void PaxSparseFilter::Initialize(List *quals) { +void PaxSparseFilter::Initialize(List *quals, ScanKey key, int nkeys) { ListCell *qual_cell; std::vector> fl_nodes; /* first level nodes */ std::string origin_tree_str; @@ -44,10 +44,27 @@ void PaxSparseFilter::Initialize(List *quals) { // no inited Assert(!filter_tree_); - if (!quals) { + if (!quals && nkeys == 0) { return; } + // walk scan key and only support min/max filter now + for (int i = 0; i < nkeys; i++) { + // TODO: support bloom filter in PaxFilter + // but now just skip it, SeqNext() will check bloom filter in PassByBloomFilter() + if (key[i].sk_flags & SK_BLOOM_FILTER) { + continue; + } + + if (key[i].sk_strategy != BTGreaterEqualStrategyNumber && + key[i].sk_strategy != BTLessEqualStrategyNumber) { + continue; + } + std::shared_ptr fl_node = ProcessScanKey(&key[i]); + Assert(fl_node); + fl_nodes.emplace_back(std::move(fl_node)); + } + foreach (qual_cell, quals) { Expr *fl_clause = (Expr *)lfirst(qual_cell); std::shared_ptr fl_node = ExprWalker(fl_clause); @@ -67,6 +84,47 @@ void PaxSparseFilter::Initialize(List *quals) { origin_tree_str.c_str(), DebugString().c_str()); } +std::shared_ptr PaxSparseFilter::ProcessScanKey(ScanKey key) { + std::shared_ptr node = nullptr; + Assert(key); + Assert(!(key->sk_flags & SK_BLOOM_FILTER)); + Assert(key->sk_strategy == BTGreaterEqualStrategyNumber || + key->sk_strategy == BTLessEqualStrategyNumber); + Assert(key->sk_attno > 0 && + key->sk_attno <= RelationGetNumberOfAttributes(rel_)); + + AttrNumber attno = key->sk_attno; + + // Build VarNode on the left + auto var_node = std::make_shared(); + var_node->attrno = attno; + + // Build ConstNode on the right from ScanKey + auto const_node = std::make_shared(); + const_node->const_val = key->sk_argument; + const_node->const_type = key->sk_subtype; + if (key->sk_flags & SK_ISNULL) { + const_node->sk_flags |= SK_ISNULL; + } + + // Build OpNode and attach children: (var, const) + auto op_node = std::make_shared(); + op_node->strategy = key->sk_strategy; + op_node->collation = key->sk_collation; // may be InvalidOid; executor will + // fallback to attr collation + + // Set 
operand types + Form_pg_attribute attr = TupleDescAttr(RelationGetDescr(rel_), attno - 1); + op_node->left_typid = attr->atttypid; + op_node->right_typid = key->sk_subtype; + + PFTNode::AppendSubNode(op_node, std::move(var_node)); + PFTNode::AppendSubNode(op_node, std::move(const_node)); + + node = op_node; + return node; +} + Expr *PaxSparseFilter::ExprFlatVar(Expr *clause) { Expr *flat_clause = clause; if (unlikely(!clause)) { diff --git a/contrib/pax_storage/src/cpp/storage/local_file_system.cc b/contrib/pax_storage/src/cpp/storage/local_file_system.cc index 1c71eceb7a9..82afafbfcfd 100644 --- a/contrib/pax_storage/src/cpp/storage/local_file_system.cc +++ b/contrib/pax_storage/src/cpp/storage/local_file_system.cc @@ -35,6 +35,7 @@ #include "access/pax_access_handle.h" #include "comm/cbdb_wrappers.h" +#include "comm/fast_io.h" #include "comm/fmt.h" #include "comm/pax_memory.h" #include "comm/pax_resource.h" @@ -51,6 +52,7 @@ class LocalFile final : public File { ssize_t Write(const void *ptr, size_t n) override; ssize_t PWrite(const void *ptr, size_t n, off_t offset) override; ssize_t PRead(void *ptr, size_t n, off_t offset) const override; + void ReadBatch(const std::vector &requests) const override; size_t FileLength() const override; void Flush() override; void Delete() override; @@ -132,6 +134,26 @@ ssize_t LocalFile::PWrite(const void *ptr, size_t n, off_t offset) { return num; } +void LocalFile::ReadBatch(const std::vector &requests) const { + if (unlikely(requests.empty())) return; + + if (IOUringFastIO::available()) { + IOUringFastIO fast_io(requests.size()); + std::vector result(requests.size(), false); + auto res = fast_io.read(fd_, const_cast&>(requests), result); + CBDB_CHECK(res.first == 0, cbdb::CException::ExType::kExTypeIOError, + fmt("Fail to ReadBatch with io_uring [successful=%d, total=%lu], %s", + res.second, requests.size(), DebugString().c_str())); + } else { + SyncFastIO fast_io; + std::vector result(requests.size(), false); + auto res = fast_io.read(fd_, const_cast&>(requests), result); + CBDB_CHECK(res.first == 0, cbdb::CException::ExType::kExTypeIOError, + fmt("Fail to ReadBatch with sync read [successful=%d, total=%lu], %s", + res.second, requests.size(), DebugString().c_str())); + } +} + size_t LocalFile::FileLength() const { struct stat file_stat {}; int rc; diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition.cc b/contrib/pax_storage/src/cpp/storage/micro_partition.cc index 57004d7d3b8..ead41b41f34 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition.cc +++ b/contrib/pax_storage/src/cpp/storage/micro_partition.cc @@ -91,7 +91,7 @@ size_t MicroPartitionReaderProxy::GetTupleCountsInGroup(size_t group_index) { std::unique_ptr MicroPartitionReaderProxy::GetGroupStatsInfo(size_t group_index) { - return std::move(reader_->GetGroupStatsInfo(group_index)); + return reader_->GetGroupStatsInfo(group_index); } std::unique_ptr diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition.h b/contrib/pax_storage/src/cpp/storage/micro_partition.h index 8c71cfbd574..56d85b46a74 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition.h +++ b/contrib/pax_storage/src/cpp/storage/micro_partition.h @@ -58,7 +58,6 @@ class MicroPartitionWriter { RelFileNode node; bool need_wal = false; std::vector> encoding_opts; - std::pair offsets_encoding_opts; std::vector enable_min_max_col_idxs; std::vector enable_bf_col_idxs; @@ -181,7 +180,7 @@ class MicroPartitionReader { // fetch, compression/encoding. 
At the same time, pax column can also be // used as a general interface for internal using, because it's zero copy // from buffer. more details in `storage/columns` - virtual const std::shared_ptr &GetAllColumns() const = 0; + virtual const std::unique_ptr &GetAllColumns() const = 0; virtual void SetVisibilityMap( std::shared_ptr visibility_bitmap) = 0; @@ -204,12 +203,10 @@ class MicroPartitionReader { std::shared_ptr filter; -#ifdef VEC_BUILD - TupleDesc tuple_desc = nullptr; -#endif - // should only reference std::shared_ptr visibility_bitmap = nullptr; + + TupleDesc tuple_desc = nullptr; }; MicroPartitionReader() = default; diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.cc b/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.cc index b1800edb012..01319e5ea86 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.cc +++ b/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.cc @@ -38,9 +38,9 @@ namespace pax { std::unique_ptr MicroPartitionFileFactory::CreateMicroPartitionReader( const MicroPartitionReader::ReaderOptions &options, int32 flags, - std::shared_ptr file, std::shared_ptr toast_file) { + std::unique_ptr file, std::unique_ptr toast_file) { std::unique_ptr reader = - std::make_unique(file, toast_file); + std::make_unique(std::move(file), std::move(toast_file)); #ifdef VEC_BUILD if (flags & ReaderFlags::FLAGS_VECTOR_PATH) { @@ -63,13 +63,13 @@ MicroPartitionFileFactory::CreateMicroPartitionReader( std::unique_ptr MicroPartitionFileFactory::CreateMicroPartitionWriter( const MicroPartitionWriter::WriterOptions &options, - std::shared_ptr file, std::shared_ptr toast_file) { + std::unique_ptr file, std::unique_ptr toast_file) { std::vector type_kinds; type_kinds = OrcWriter::BuildSchema( options.rel_tuple_desc, options.storage_format == PaxStorageFormat::kTypeStoragePorcVec); - return std::make_unique(options, std::move(type_kinds), file, - toast_file); + return std::make_unique(options, std::move(type_kinds), + std::move(file), std::move(toast_file)); } } // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.h b/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.h index fbfef5a16b3..bc2f04056d2 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.h +++ b/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory.h @@ -52,13 +52,13 @@ class MicroPartitionFileFactory final { public: static std::unique_ptr CreateMicroPartitionWriter( const MicroPartitionWriter::WriterOptions &options, - std::shared_ptr file, - std::shared_ptr toast_file = nullptr); + std::unique_ptr file, + std::unique_ptr toast_file = nullptr); static std::unique_ptr CreateMicroPartitionReader( const MicroPartitionReader::ReaderOptions &options, int32 flags, - std::shared_ptr file, - std::shared_ptr toast_file = nullptr); + std::unique_ptr file, + std::unique_ptr toast_file = nullptr); }; } // namespace pax diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory_test.cc b/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory_test.cc index fd66dd3f7ff..59d4bc0f6a1 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory_test.cc +++ b/contrib/pax_storage/src/cpp/storage/micro_partition_file_factory_test.cc @@ -72,8 +72,8 @@ TEST_F(MicroPartitionFileFactoryTest, CreateMicroPartitionWriter) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = 
local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector> types_encoding; types_encoding.emplace_back( @@ -88,7 +88,7 @@ TEST_F(MicroPartitionFileFactoryTest, CreateMicroPartitionWriter) { writer_options.encoding_opts = types_encoding; auto writer = MicroPartitionFileFactory::CreateMicroPartitionWriter( - writer_options, file_ptr); + writer_options, std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Close(); @@ -101,8 +101,8 @@ TEST_F(MicroPartitionFileFactoryTest, CreateMicroPartitionReader) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector> types_encoding; types_encoding.emplace_back( @@ -117,7 +117,7 @@ TEST_F(MicroPartitionFileFactoryTest, CreateMicroPartitionReader) { writer_options.encoding_opts = types_encoding; auto writer = MicroPartitionFileFactory::CreateMicroPartitionWriter( - writer_options, file_ptr); + writer_options, std::move(file_ptr)); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); writer->WriteTuple(tuple_slot); @@ -130,7 +130,7 @@ TEST_F(MicroPartitionFileFactoryTest, CreateMicroPartitionReader) { int32 flags = FLAGS_EMPTY; auto reader = MicroPartitionFileFactory::CreateMicroPartitionReader( - reader_options, flags, file_ptr); + reader_options, flags, std::move(file_ptr)); reader->ReadTuple(tuple_slot_empty); EXPECT_TRUE(VerifyTestTupleTableSlot(tuple_slot_empty)); @@ -145,8 +145,8 @@ TEST_F(MicroPartitionFileFactoryTest, OrcReadWithVisibilitymap) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector> types_encoding; types_encoding.emplace_back( @@ -161,7 +161,7 @@ TEST_F(MicroPartitionFileFactoryTest, OrcReadWithVisibilitymap) { writer_options.encoding_opts = types_encoding; auto writer = MicroPartitionFileFactory::CreateMicroPartitionWriter( - writer_options, file_ptr); + writer_options, std::move(file_ptr)); int tuple_count = 1000; for (int i = 0; i < tuple_count; i++) { @@ -186,7 +186,7 @@ TEST_F(MicroPartitionFileFactoryTest, OrcReadWithVisibilitymap) { TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(); auto reader = MicroPartitionFileFactory::CreateMicroPartitionReader( - reader_options, flags, file_ptr); + reader_options, flags, std::move(file_ptr)); int read_tuple_count = 0; while (reader->ReadTuple(tuple_slot_empty)) { @@ -210,8 +210,8 @@ TEST_F(MicroPartitionFileFactoryTest, VecReadWithVisibilitymap) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector> types_encoding; types_encoding.emplace_back( @@ -226,7 +226,7 @@ TEST_F(MicroPartitionFileFactoryTest, VecReadWithVisibilitymap) { writer_options.encoding_opts = types_encoding; auto writer = MicroPartitionFileFactory::CreateMicroPartitionWriter( - writer_options, file_ptr); + writer_options, 
std::move(file_ptr)); int tuple_count = 1000; for (int i = 0; i < tuple_count; i++) { @@ -264,7 +264,7 @@ TEST_F(MicroPartitionFileFactoryTest, VecReadWithVisibilitymap) { CreateVecEmptyTupleSlot(tuple_slot->tts_tupleDescriptor); auto reader = MicroPartitionFileFactory::CreateMicroPartitionReader( - reader_options, flags, file_ptr); + reader_options, flags, std::move(file_ptr)); auto ret = reader->ReadTuple(read_tuple_slot); ASSERT_TRUE(ret); diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_iterator.cc b/contrib/pax_storage/src/cpp/storage/micro_partition_iterator.cc index 9c9a1f8c8cd..bb9590ca5e9 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition_iterator.cc +++ b/contrib/pax_storage/src/cpp/storage/micro_partition_iterator.cc @@ -164,7 +164,7 @@ MicroPartitionMetadata MicroPartitionInfoIterator::Next() { Assert(tuple); tuple_ = nullptr; - return std::move(ToValue(tuple)); + return ToValue(tuple); } void MicroPartitionInfoIterator::Rewind() { @@ -375,7 +375,7 @@ MicroPartitionMetadata MicroPartitionInfoParallelIterator::Next() { Assert(tuple); tuple_ = nullptr; - return std::move(ToValue(tuple)); + return ToValue(tuple); } void MicroPartitionInfoParallelIterator::Rewind() { diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_stats.cc b/contrib/pax_storage/src/cpp/storage/micro_partition_stats.cc index b18ab952527..53342ad0e2a 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition_stats.cc +++ b/contrib/pax_storage/src/cpp/storage/micro_partition_stats.cc @@ -173,7 +173,6 @@ static bool PrepareStatisticsInfoCombine( auto attr = TupleDescAttr(desc, i); auto collation = attr->attcollation; - FmgrInfo finfo; bool get_pg_oper_succ = false; funcs.emplace_back(std::make_pair(nullptr, nullptr)); @@ -207,7 +206,6 @@ static bool PrepareStatisticsInfoCombine( GetStrategyProcinfo(attr->atttypid, attr->atttypid, funcs[i]); if (allow_fallback_to_pg) { - finfos[i] = {finfo, finfo}; get_pg_oper_succ = GetStrategyProcinfo(attr->atttypid, attr->atttypid, finfos[i]); } diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_stats_test.cc b/contrib/pax_storage/src/cpp/storage/micro_partition_stats_test.cc index a589a3971fa..c18aa491432 100644 --- a/contrib/pax_storage/src/cpp/storage/micro_partition_stats_test.cc +++ b/contrib/pax_storage/src/cpp/storage/micro_partition_stats_test.cc @@ -51,36 +51,10 @@ TEST_F(MicroPartitionStatsTest, MicroPartitionStatsInfoCombine) { cbdb::Palloc0(sizeof(TupleDescData) + sizeof(FormData_pg_attribute) * 4)); tuple_desc->natts = 4; - tuple_desc->attrs[0] = {.atttypid = INT4OID, - .attlen = 4, - .attbyval = true, - .attalign = TYPALIGN_INT, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = InvalidOid}; - - tuple_desc->attrs[1] = {.atttypid = TEXTOID, - .attlen = -1, - .attbyval = false, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = DEFAULT_COLLATION_OID}; - - tuple_desc->attrs[2] = {.atttypid = INT4OID, - .attlen = 4, - .attbyval = true, - .attalign = TYPALIGN_INT, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = InvalidOid}; - tuple_desc->attrs[3] = {.atttypid = INT4OID, - .attlen = 4, - .attbyval = true, - .attalign = TYPALIGN_INT, - .attstorage = TYPSTORAGE_PLAIN, - .attisdropped = false, - .attcollation = InvalidOid}; + InitAttribute_int4(&tuple_desc->attrs[0]); + InitAttribute_text(&tuple_desc->attrs[1]); + InitAttribute_int4(&tuple_desc->attrs[2]); + InitAttribute_int4(&tuple_desc->attrs[3]); 
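The `InitAttribute_int4` / `InitAttribute_text` helpers used above replace the hand-written designated initializers removed in this hunk. Their definitions are not part of this excerpt, so the following is only a sketch that reconstructs plausible bodies from the exact fields those initializers set; the `_sketch` names are hypothetical and the standard PostgreSQL catalog headers are assumed.

```cpp
// Illustrative sketch only; the real InitAttribute_* helpers are defined
// elsewhere in this patch. Field values are copied from the designated
// initializers removed in the hunk above.
static inline void InitAttribute_int4_sketch(Form_pg_attribute attr) {
  attr->atttypid = INT4OID;          // 4-byte integer column
  attr->attlen = 4;
  attr->attbyval = true;
  attr->attalign = TYPALIGN_INT;
  attr->attstorage = TYPSTORAGE_PLAIN;
  attr->attisdropped = false;
  attr->attcollation = InvalidOid;
}

static inline void InitAttribute_text_sketch(Form_pg_attribute attr) {
  attr->atttypid = TEXTOID;          // varlena text column
  attr->attlen = -1;
  attr->attbyval = false;
  attr->attalign = TYPALIGN_DOUBLE;
  attr->attstorage = TYPSTORAGE_PLAIN;
  attr->attisdropped = false;
  attr->attcollation = DEFAULT_COLLATION_OID;
}
```

Centralizing these assignments keeps `attalign`, `attstorage`, and collation settings consistent across the test fixtures instead of repeating seven-field initializer blocks in every test.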
auto col_stats1_1 = mp_stats_info1.add_columnstats(); auto col_stats1_2 = mp_stats_info1.add_columnstats(); @@ -304,4 +278,4 @@ TEST_F(MicroPartitionStatsTest, MicroPartitionStatsInfoCombine) { ASSERT_EQ(min_datum, cbdb::Int32ToDatum(50)); } -} // namespace pax::tests \ No newline at end of file +} // namespace pax::tests diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_dump_reader.cpp b/contrib/pax_storage/src/cpp/storage/orc/orc_dump_reader.cpp index 16f8f93a833..f1f1eefd1df 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_dump_reader.cpp +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_dump_reader.cpp @@ -91,8 +91,8 @@ OrcDumpReader::OrcDumpReader(DumpConfig *config) bool OrcDumpReader::Open() { FileSystem *fs = nullptr; std::shared_ptr fs_opt; - std::shared_ptr open_file; - std::shared_ptr open_toast_file; + std::unique_ptr open_file; + std::unique_ptr open_toast_file; assert(config_); assert(config_->file_name); @@ -111,7 +111,7 @@ bool OrcDumpReader::Open() { } } - format_reader_ = new OrcFormatReader(open_file, open_toast_file); + format_reader_ = new OrcFormatReader(std::move(open_file), std::move(open_toast_file)); format_reader_->Open(); return true; diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.cc index 7ee6df8ef08..ad1736f4f08 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.cc @@ -35,8 +35,8 @@ namespace pax { -OrcFormatReader::OrcFormatReader(std::shared_ptr file, - std::shared_ptr toast_file) +OrcFormatReader::OrcFormatReader(std::unique_ptr file, + std::unique_ptr toast_file) : file_(std::move(file)), toast_file_(std::move(toast_file)), reused_buffer_(nullptr), @@ -327,6 +327,7 @@ pax::porc::proto::StripeFooter OrcFormatReader::ReadStripeWithProjection( batch_offset = stripe_footer_offset; + std::vector io_requests; while (index < column_types_.size()) { // Current column have been skipped // Move `batch_offset` and `streams_index` to the right position @@ -398,10 +399,17 @@ pax::porc::proto::StripeFooter OrcFormatReader::ReadStripeWithProjection( continue; } - file_->PReadN(data_buffer->GetAvailableBuffer(), batch_len, batch_offset); + { + IORequest io_request; + io_request.offset = batch_offset; + io_request.size = batch_len; + io_request.buffer = data_buffer->GetAvailableBuffer(); + io_requests.emplace_back(io_request); + } data_buffer->Brush(batch_len); batch_offset += batch_len; } + file_->ReadBatch(io_requests); return stripe_footer; } @@ -413,14 +421,14 @@ static std::unique_ptr BuildEncodingColumn( const ColumnEncoding &data_encoding, bool is_vec) { uint32 not_null_rows = 0; uint64 data_stream_len = 0; - std::shared_ptr> data_stream_buffer; + std::unique_ptr> data_stream_buffer; Assert(data_stream.kind() == pax::porc::proto::Stream_Kind_DATA); not_null_rows = static_cast(data_stream.column()); data_stream_len = static_cast(data_stream.length()); - data_stream_buffer = std::make_shared>( + data_stream_buffer = std::make_unique>( reinterpret_cast(data_buffer->GetAvailableBuffer()), data_stream_len, false, false); @@ -443,7 +451,7 @@ static std::unique_ptr BuildEncodingColumn( auto pax_column = traits::ColumnOptCreateTraits::create_decoding( alloc_size, decoding_option); - pax_column->Set(data_stream_buffer, (size_t)not_null_rows); + pax_column->Set(std::move(data_stream_buffer), (size_t)not_null_rows); return pax_column; } else { AssertImply(data_encoding.kind() == @@ -455,7 
+463,7 @@ static std::unique_ptr BuildEncodingColumn( auto pax_column = traits::ColumnOptCreateTraits::create_decoding( alloc_size, decoding_option); - pax_column->Set(data_stream_buffer); + pax_column->Set(std::move(data_stream_buffer)); return pax_column; } } @@ -466,14 +474,14 @@ static std::unique_ptr BuildEncodingBitPackedColumn( bool is_vec) { uint32 not_null_rows = 0; uint64 column_data_len = 0; - std::shared_ptr> column_data_buffer; + std::unique_ptr> column_data_buffer; Assert(data_stream.kind() == pax::porc::proto::Stream_Kind_DATA); not_null_rows = static_cast(data_stream.column()); column_data_len = static_cast(data_stream.length()); - column_data_buffer = std::make_shared>( + column_data_buffer = std::make_unique>( reinterpret_cast(data_buffer->GetAvailableBuffer()), column_data_len, false, false); @@ -496,7 +504,7 @@ static std::unique_ptr BuildEncodingBitPackedColumn( auto pax_column = traits::ColumnOptCreateTraits2::create_decoding( alloc_size, decoding_option); - pax_column->Set(column_data_buffer, (size_t)not_null_rows); + pax_column->Set(std::move(column_data_buffer), (size_t)not_null_rows); return pax_column; } else { AssertImply(data_encoding.kind() == @@ -508,7 +516,7 @@ static std::unique_ptr BuildEncodingBitPackedColumn( auto pax_column = traits::ColumnOptCreateTraits2::create_decoding( alloc_size, decoding_option); - pax_column->Set(column_data_buffer); + pax_column->Set(std::move(column_data_buffer)); return pax_column; } } @@ -521,8 +529,8 @@ static std::unique_ptr BuildEncodingDecimalColumn( uint32 not_null_rows = 0; uint64 offset_stream_len = 0; uint64 data_stream_len = 0; - std::shared_ptr> offset_stream_buffer; - std::shared_ptr> data_stream_buffer; + std::unique_ptr> offset_stream_buffer; + std::unique_ptr> data_stream_buffer; std::unique_ptr pax_column; uint64 padding = 0; @@ -530,7 +538,7 @@ static std::unique_ptr BuildEncodingDecimalColumn( offset_stream_len = static_cast(len_stream.length()); padding = len_stream.padding(); - offset_stream_buffer = std::make_shared>( + offset_stream_buffer = std::make_unique>( reinterpret_cast(data_buffer->GetAvailableBuffer()), offset_stream_len, false, false); @@ -560,7 +568,7 @@ static std::unique_ptr BuildEncodingDecimalColumn( } #endif - data_stream_buffer = std::make_shared>( + data_stream_buffer = std::make_unique>( data_buffer->GetAvailableBuffer(), data_stream_len, false, false); data_stream_buffer->BrushAll(); data_buffer->Brush(data_stream_len); @@ -591,7 +599,7 @@ static std::unique_ptr BuildEncodingDecimalColumn( data_cap, offsets_cap, std::move(decoding_option)); // current memory will be freed in pax_columns->data_ - pax_column->Set(data_stream_buffer, offset_stream_buffer, data_stream_len); + pax_column->Set(std::move(data_stream_buffer), std::move(offset_stream_buffer), data_stream_len); return pax_column; } @@ -601,7 +609,7 @@ static std::unique_ptr BuildVecEncodingDecimalColumn( const ColumnEncoding &data_encoding, bool is_vec) { uint32 not_null_rows = 0; uint64 data_stream_len = 0; - std::shared_ptr> data_stream_buffer; + std::unique_ptr> data_stream_buffer; CBDB_CHECK(is_vec, cbdb::CException::ExType::kExTypeLogicError); @@ -610,7 +618,7 @@ static std::unique_ptr BuildVecEncodingDecimalColumn( not_null_rows = static_cast(data_stream.column()); data_stream_len = static_cast(data_stream.length()); - data_stream_buffer = std::make_shared>( + data_stream_buffer = std::make_unique>( reinterpret_cast(data_buffer->GetAvailableBuffer()), data_stream_len, false, false); @@ -630,7 +638,7 @@ static 
std::unique_ptr BuildVecEncodingDecimalColumn( auto pax_column = traits::ColumnOptCreateTraits2:: // create_decoding(alloc_size, decoding_option); - pax_column->Set(data_stream_buffer, (size_t)not_null_rows); + pax_column->Set(std::move(data_stream_buffer), (size_t)not_null_rows); return pax_column; } @@ -644,8 +652,8 @@ static std::unique_ptr BuildEncodingVecNonFixedColumn( uint64 offset_stream_len = 0; uint64 padding = 0; uint64 data_stream_len = 0; - std::shared_ptr> offset_stream_buffer; - std::shared_ptr> data_stream_buffer; + std::unique_ptr> offset_stream_buffer; + std::unique_ptr> data_stream_buffer; std::unique_ptr pax_column; PaxDecoder::DecodingOption decoding_option; size_t data_cap, offsets_cap; @@ -658,7 +666,7 @@ static std::unique_ptr BuildEncodingVecNonFixedColumn( offset_stream_len = static_cast(len_stream.length()); padding = len_stream.padding(); - offset_stream_buffer = std::make_shared>( + offset_stream_buffer = std::make_unique>( reinterpret_cast(data_buffer->GetAvailableBuffer()), offset_stream_len, false, false); @@ -676,7 +684,7 @@ static std::unique_ptr BuildEncodingVecNonFixedColumn( } data_buffer->Brush(offset_stream_len); - data_stream_buffer = std::make_shared>( + data_stream_buffer = std::make_unique>( data_buffer->GetAvailableBuffer(), data_stream_len, false, false); decoding_option.column_encode_type = data_encoding.kind(); @@ -730,7 +738,7 @@ static std::unique_ptr BuildEncodingVecNonFixedColumn( create_decoding(data_cap, offsets_cap, std::move(decoding_option)); } } - pax_column->Set(data_stream_buffer, offset_stream_buffer, data_stream_len, + pax_column->Set(std::move(data_stream_buffer), std::move(offset_stream_buffer), data_stream_len, not_null_rows); return pax_column; } @@ -743,8 +751,8 @@ static std::unique_ptr BuildEncodingNonFixedColumn( [[maybe_unused]] uint32 not_null_rows = 0; uint64 offset_stream_len = 0; uint64 data_stream_len = 0; - std::shared_ptr> offset_stream_buffer; - std::shared_ptr> data_stream_buffer; + std::unique_ptr> offset_stream_buffer; + std::unique_ptr> data_stream_buffer; std::unique_ptr pax_column; uint64 padding = 0; PaxDecoder::DecodingOption decoding_option; @@ -754,7 +762,7 @@ static std::unique_ptr BuildEncodingNonFixedColumn( offset_stream_len = static_cast(len_stream.length()); padding = len_stream.padding(); - offset_stream_buffer = std::make_shared>( + offset_stream_buffer = std::make_unique>( reinterpret_cast(data_buffer->GetAvailableBuffer()), offset_stream_len, false, false); @@ -776,7 +784,7 @@ static std::unique_ptr BuildEncodingNonFixedColumn( } #endif - data_stream_buffer = std::make_shared>( + data_stream_buffer = std::make_unique>( data_buffer->GetAvailableBuffer(), data_stream_len, false, false); data_stream_buffer->BrushAll(); data_buffer->Brush(data_stream_len); @@ -819,7 +827,7 @@ static std::unique_ptr BuildEncodingNonFixedColumn( } // current memory will be freed in pax_columns->data_ - pax_column->Set(data_stream_buffer, offset_stream_buffer, data_stream_len); + pax_column->Set(std::move(data_stream_buffer), std::move(offset_stream_buffer), data_stream_len); return pax_column; } @@ -899,7 +907,7 @@ std::unique_ptr OrcFormatReader::ReadStripe( continue; } - std::shared_ptr non_null_bitmap; + std::unique_ptr non_null_bitmap; bool has_null = stripe_info.colstats(index).hasnull(); if (has_null) { const pax::porc::proto::Stream &non_null_stream = @@ -909,9 +917,7 @@ std::unique_ptr OrcFormatReader::ReadStripe( reinterpret_cast(data_buffer->GetAvailableBuffer()); Assert(non_null_stream.kind() == 
pax::porc::proto::Stream_Kind_PRESENT); - non_null_bitmap = - std::make_shared(BitmapRaw(bm_bytes, bm_nbytes), - BitmapTpl::ReadOnlyRefBitmap); + non_null_bitmap = std::make_unique(BitmapRaw(bm_bytes, bm_nbytes)); data_buffer->Brush(bm_nbytes); } @@ -1023,7 +1029,7 @@ std::unique_ptr OrcFormatReader::ReadStripe( last_column->SetRows(stripe_info.numberofrows()); if (has_null) { Assert(non_null_bitmap); - last_column->SetBitmap(non_null_bitmap); + last_column->SetBitmap(std::move(non_null_bitmap)); } if (!column_attrs_[index].empty()) { diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.h b/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.h index 2c48b762955..003a780ca7b 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.h +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_format_reader.h @@ -38,7 +38,7 @@ class OrcDumpReader; } class OrcFormatReader final { public: - explicit OrcFormatReader(std::shared_ptr file, std::shared_ptr toast_file = nullptr); + explicit OrcFormatReader(std::unique_ptr file, std::unique_ptr toast_file = nullptr); ~OrcFormatReader(); @@ -78,8 +78,8 @@ class OrcFormatReader final { friend class OrcGroupStatsProvider; std::vector column_types_; std::vector> column_attrs_; - std::shared_ptr file_; - std::shared_ptr toast_file_; + std::unique_ptr file_; + std::unique_ptr toast_file_; std::shared_ptr> reused_buffer_; size_t num_of_stripes_; bool is_vec_; diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_group.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_group.cc index a183c445db5..d7c8c73d220 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_group.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_group.cc @@ -49,7 +49,7 @@ inline static std::pair GetColumnDatum(PaxColumn *column, Datum rc; if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); if (!bm->Test(row_index)) { *null_counts += 1; @@ -64,7 +64,7 @@ inline static std::pair GetColumnDatum(PaxColumn *column, return {rc, false}; } -OrcGroup::OrcGroup(std::unique_ptr &&pax_column, size_t row_offset, +OrcGroup::OrcGroup(std::unique_ptr pax_column, size_t row_offset, const std::vector *proj_col_index, std::shared_ptr micro_partition_visibility_bitmap) : pax_columns_(std::move(pax_column)), @@ -88,7 +88,7 @@ size_t OrcGroup::GetRows() const { return pax_columns_->GetRows(); } size_t OrcGroup::GetRowOffset() const { return row_offset_; } -const std::shared_ptr &OrcGroup::GetAllColumns() const { +const std::unique_ptr &OrcGroup::GetAllColumns() const { return pax_columns_; } @@ -123,7 +123,7 @@ std::pair OrcGroup::ReadTuple(TupleTableSlot *slot) { } if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); if (!bm->Test(current_row_index_)) { current_nulls_[index]++; @@ -307,7 +307,7 @@ std::pair OrcGroup::GetColumnValueNoMissing(size_t column_index, void OrcGroup::CalcNullShuffle(PaxColumn *column, size_t column_index) { auto rows = column->GetRows(); uint32 n_counts = 0; - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); Assert(column->HasNull() && !nulls_shuffle_[column_index]); diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_group.h b/contrib/pax_storage/src/cpp/storage/orc/orc_group.h index fb59740c7f4..d8b44092ad8 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_group.h +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_group.h @@ -40,7 +40,7 @@ class OrcDumpReader; class OrcGroup : public 
MicroPartitionReader::Group { public: OrcGroup( - std::unique_ptr &&pax_column, size_t row_offset, + std::unique_ptr pax_column, size_t row_offset, const std::vector *proj_col_index, std::shared_ptr micro_partition_visibility_bitmap = nullptr); @@ -50,7 +50,7 @@ class OrcGroup : public MicroPartitionReader::Group { size_t GetRowOffset() const override; - const std::shared_ptr &GetAllColumns() const override; + const std::unique_ptr &GetAllColumns() const override; virtual std::pair ReadTuple(TupleTableSlot *slot) override; @@ -74,7 +74,7 @@ class OrcGroup : public MicroPartitionReader::Group { size_t row_index); protected: - std::shared_ptr pax_columns_; + std::unique_ptr pax_columns_; std::shared_ptr micro_partition_visibility_bitmap_; size_t row_offset_; size_t current_row_index_; diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_reader.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_reader.cc index 764158dc96d..3fa4192efc6 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_reader.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_reader.cc @@ -102,12 +102,12 @@ class OrcGroupStatsProvider final : public ColumnStatsProvider { size_t group_index_; }; -OrcReader::OrcReader(std::shared_ptr file, - std::shared_ptr toast_file) +OrcReader::OrcReader(std::unique_ptr file, + std::unique_ptr toast_file) : working_group_(nullptr), cached_group_(nullptr), current_group_index_(0), - format_reader_(file, toast_file), + format_reader_(std::move(file), std::move(toast_file)), is_closed_(true) {} std::unique_ptr OrcReader::GetGroupStatsInfo( @@ -129,7 +129,7 @@ std::unique_ptr OrcReader::ReadGroup( for (size_t i = 0; i < pax_columns->GetColumns(); i++) { auto column = (*pax_columns)[i].get(); if (column && !column->GetBuffer().first) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); // Assert(bm); if (bm) { for (size_t n = 0; n < column->GetRows(); n++) { diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_test.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_test.cc index 5be2a860d58..006efebbbbf 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_test.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_test.cc @@ -144,7 +144,7 @@ TEST_F(OrcTest, WriteTuple) { ASSERT_NE(nullptr, local_fs); auto file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + EXPECT_NE(nullptr, file_ptr.get()); OrcWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; @@ -164,7 +164,7 @@ TEST_F(OrcTest, OpenOrc) { ASSERT_NE(nullptr, local_fs); auto file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; @@ -192,8 +192,8 @@ TEST_F(OrcTest, WriteReadStripes) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; @@ -201,7 +201,7 @@ TEST_F(OrcTest, WriteReadStripes) { // file_ptr in orc writer will be freed when writer do destruct // current OrcWriter::CreateWriter only for test auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), 
file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Close(); @@ -210,7 +210,7 @@ TEST_F(OrcTest, WriteReadStripes) { // file_ptr in orc reader will be freed when reader do destruct MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); @@ -229,13 +229,13 @@ TEST_F(OrcTest, WriteReadStripesTwice) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->WriteTuple(tuple_slot); @@ -244,7 +244,7 @@ TEST_F(OrcTest, WriteReadStripesTwice) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); @@ -280,14 +280,14 @@ TEST_F(OrcTest, WriteReadMultiStripes) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Flush(); @@ -298,7 +298,7 @@ TEST_F(OrcTest, WriteReadMultiStripes) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(2UL, reader->GetGroupNums()); @@ -319,14 +319,14 @@ TEST_F(OrcTest, WriteReadCloseEmptyOrc) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Flush(); @@ -336,7 +336,7 @@ TEST_F(OrcTest, WriteReadCloseEmptyOrc) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, 
reader->GetGroupNums()); @@ -353,14 +353,14 @@ TEST_F(OrcTest, WriteReadEmptyOrc) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); // flush empty writer->Flush(); // direct close @@ -369,7 +369,7 @@ TEST_F(OrcTest, WriteReadEmptyOrc) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(0UL, reader->GetGroupNums()); reader->Close(); @@ -382,14 +382,14 @@ TEST_F(OrcTest, ReadTuple) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); writer->WriteTuple(tuple_slot); @@ -398,7 +398,7 @@ TEST_F(OrcTest, ReadTuple) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); reader->ReadTuple(tuple_slot_empty); @@ -415,15 +415,15 @@ TEST_F(OrcTest, GetTuple) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; writer_options.group_limit = 100; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); for (int i = 0; i < 1000; i++) { if (i % 5 == 0) { tuple_slot->tts_isnull[0] = true; @@ -440,7 +440,7 @@ TEST_F(OrcTest, GetTuple) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); reader->Open(reader_options); @@ -487,16 +487,13 @@ TEST_F(OrcTest, WriteReadTupleWithToast) { pax_min_size_of_external_toast = 1024; tuple_desc->natts = TOAST_COLUMN_NUMS; - tuple_desc->attrs[0] = {.atttypid = TEXTOID, - .attlen = -1, - .attbyval = false, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_EXTENDED, - 
.attisdropped = false, - .attcollation = DEFAULT_COLLATION_OID}; - tuple_desc->attrs[1] = tuple_desc->attrs[0]; - tuple_desc->attrs[2] = tuple_desc->attrs[0]; - tuple_desc->attrs[3] = tuple_desc->attrs[0]; + InitAttribute_text(&tuple_desc->attrs[0]); + InitAttribute_text(&tuple_desc->attrs[1]); + InitAttribute_text(&tuple_desc->attrs[2]); + InitAttribute_text(&tuple_desc->attrs[3]); + tuple_desc->attrs[0].attstorage = TYPSTORAGE_EXTENDED; + tuple_desc->attrs[1].attstorage = TYPSTORAGE_EXTENDED; + tuple_desc->attrs[2].attstorage = TYPSTORAGE_EXTENDED; // column 4 is external but no compress tuple_desc->attrs[3].attstorage = TYPSTORAGE_EXTERNAL; @@ -524,11 +521,11 @@ TEST_F(OrcTest, WriteReadTupleWithToast) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); - std::shared_ptr toast_file_ptr = local_fs->Open(toast_file_name, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr toast_file_ptr = local_fs->Open(toast_file_name, fs::kWriteMode); + EXPECT_NE(nullptr, toast_file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; @@ -542,7 +539,7 @@ TEST_F(OrcTest, WriteReadTupleWithToast) { std::vector types_for_read = types; auto writer = OrcWriter::CreateWriter(writer_options, std::move(types), - file_ptr, toast_file_ptr); + std::move(file_ptr), std::move(toast_file_ptr)); for (int i = 0; i < 106; i++) { switch (i % 3) { case 0: { @@ -577,12 +574,12 @@ TEST_F(OrcTest, WriteReadTupleWithToast) { // begin full read without projection file_ptr = local_fs->Open(file_name_, fs::kReadMode); - EXPECT_NE(nullptr, file_ptr); + EXPECT_NE(nullptr, file_ptr.get()); toast_file_ptr = local_fs->Open(toast_file_name, fs::kReadMode); - EXPECT_NE(nullptr, file_ptr); + EXPECT_NE(nullptr, toast_file_ptr.get()); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr, toast_file_ptr); + auto reader = new OrcReader(std::move(file_ptr), std::move(toast_file_ptr)); tuple_slot_empty = MakeTupleTableSlot(tuple_desc, &TTSOpsVirtual); reader->Open(reader_options); @@ -660,16 +657,16 @@ TEST_F(OrcTest, WriteReadTupleWithToast) { // begin read with projection file_ptr = local_fs->Open(file_name_, fs::kReadMode); - EXPECT_NE(nullptr, file_ptr); + EXPECT_NE(nullptr, file_ptr.get()); toast_file_ptr = local_fs->Open(toast_file_name, fs::kReadMode); - EXPECT_NE(nullptr, file_ptr); + EXPECT_NE(nullptr, toast_file_ptr.get()); std::vector projection = {false, true, true, true}; std::shared_ptr filter = std::make_shared(); filter->SetColumnProjection(std::move(projection)); reader_options.filter = filter; - reader = new OrcReader(file_ptr, toast_file_ptr); + reader = new OrcReader(std::move(file_ptr), std::move(toast_file_ptr)); reader->Open(reader_options); EXPECT_EQ(6UL, reader->GetGroupNums()); @@ -763,19 +760,8 @@ TEST_P(OrcEncodingTest, ReadTupleWithEncoding) { cbdb::Palloc0(sizeof(TupleDescData) + sizeof(FormData_pg_attribute) * 2)); tuple_desc->natts = 2; - tuple_desc->attrs[0] = { - .attlen = 8, - .attbyval = true, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - }; - - tuple_desc->attrs[1] = { - .attlen = 8, - .attbyval = true, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - }; + InitAttribute_int8(&tuple_desc->attrs[0]); + 
InitAttribute_int8(&tuple_desc->attrs[1]); tuple_slot = MakeTupleTableSlot(tuple_desc, &TTSOpsVirtual); bool *fake_is_null = @@ -790,8 +776,8 @@ TEST_P(OrcEncodingTest, ReadTupleWithEncoding) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector types; types.emplace_back(pax::porc::proto::Type_Kind::Type_Kind_LONG); @@ -803,7 +789,7 @@ TEST_P(OrcEncodingTest, ReadTupleWithEncoding) { writer_options.encoding_opts = types_encoding; writer_options.rel_tuple_desc = tuple_desc; - auto writer = new OrcWriter(writer_options, types, file_ptr); + auto writer = new OrcWriter(writer_options, types, std::move(file_ptr)); for (size_t i = 0; i < 10000; i++) { tuple_slot->tts_values[0] = Int64GetDatum(i); @@ -816,7 +802,7 @@ TEST_P(OrcEncodingTest, ReadTupleWithEncoding) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); for (size_t i = 0; i < 10000; i++) { @@ -847,19 +833,8 @@ TEST_P(OrcCompressTest, ReadTupleWithCompress) { cbdb::Palloc0(sizeof(TupleDescData) + sizeof(FormData_pg_attribute) * 2)); tuple_desc->natts = 2; - tuple_desc->attrs[0] = { - .attlen = -1, - .attbyval = false, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - }; - - tuple_desc->attrs[1] = { - .attlen = -1, - .attbyval = false, - .attalign = TYPALIGN_DOUBLE, - .attstorage = TYPSTORAGE_PLAIN, - }; + InitAttribute_text(&tuple_desc->attrs[0]); + InitAttribute_text(&tuple_desc->attrs[1]); tuple_slot = MakeTupleTableSlot(tuple_desc, &TTSOpsVirtual); bool *fake_is_null = @@ -872,8 +847,8 @@ TEST_P(OrcCompressTest, ReadTupleWithCompress) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector types; types.emplace_back(pax::porc::proto::Type_Kind::Type_Kind_STRING); @@ -885,7 +860,7 @@ TEST_P(OrcCompressTest, ReadTupleWithCompress) { writer_options.encoding_opts = types_encoding; writer_options.rel_tuple_desc = tuple_desc; - auto writer = new OrcWriter(writer_options, types, file_ptr); + auto writer = new OrcWriter(writer_options, types, std::move(file_ptr)); for (size_t i = 0; i < COLUMN_SIZE; i++) { column_buff_str[i] = i; @@ -903,7 +878,7 @@ TEST_P(OrcCompressTest, ReadTupleWithCompress) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); ASSERT_EQ(1UL, reader->GetGroupNums()); @@ -943,14 +918,14 @@ TEST_F(OrcTest, ReadTupleDefaultColumn) { auto *local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto 
writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Close(); @@ -958,16 +933,12 @@ TEST_F(OrcTest, ReadTupleDefaultColumn) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false, 4); - tuple_slot_empty->tts_tupleDescriptor->attrs[3] = { - .attlen = 4, - .attbyval = true, - .attstorage = TYPSTORAGE_PLAIN, - }; + InitAttribute_int4(&tuple_slot_empty->tts_tupleDescriptor->attrs[3]); tuple_slot_empty->tts_tupleDescriptor->natts = COLUMN_NUMS + 1; @@ -998,14 +969,14 @@ TEST_F(OrcTest, ReadTupleDroppedColumn) { auto *local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Close(); @@ -1013,7 +984,7 @@ TEST_F(OrcTest, ReadTupleDroppedColumn) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); @@ -1033,21 +1004,21 @@ TEST_F(OrcTest, ReadTupleDroppedColumnWithProjection) { auto *local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Close(); file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); @@ -1071,18 +1042,8 @@ TEST_F(OrcTest, WriteReadBigTuple) { cbdb::Palloc0(sizeof(TupleDescData) + sizeof(FormData_pg_attribute) * 2)); tuple_desc->natts = 2; - tuple_desc->attrs[0] = { - .attlen = 4, - .attbyval = true, - .attalign = TYPALIGN_INT, - .attstorage = TYPSTORAGE_PLAIN, - }; - tuple_desc->attrs[1] = { - .attlen = 4, - .attbyval = true, - .attalign = TYPALIGN_INT, - .attstorage = TYPSTORAGE_PLAIN, - }; + InitAttribute_int4(&tuple_desc->attrs[0]); + InitAttribute_int4(&tuple_desc->attrs[1]); 
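The test updates above all apply the same ownership change: `Open()` now yields exclusive ownership of the file handle, which is moved into the writer or reader, and null checks go through `.get()`. Below is a minimal sketch of that pattern, assuming the pointee type is `File` (inferred from the `OrcFormatReader` members changed earlier in this patch) and reusing the fixtures of the surrounding tests (`local_fs`, `file_name_`, `writer_options`, `tuple_slot`, `CreateTestSchemaTypes()`); it is not an exact copy of any single test.

```cpp
// Unique-ownership pattern used by the updated tests; identifiers other than
// std::unique_ptr / std::move come from the surrounding test fixture.
std::unique_ptr<File> file_ptr = local_fs->Open(file_name_, fs::kWriteMode);
EXPECT_NE(nullptr, file_ptr.get());  // compare the raw pointer, not the smart pointer

auto writer = OrcWriter::CreateWriter(writer_options, CreateTestSchemaTypes(),
                                      std::move(file_ptr));
// file_ptr is now null; the writer owns the file and frees it when destructed.
writer->WriteTuple(tuple_slot);
writer->Close();

// Re-open the path for reading and hand ownership to the reader the same way.
file_ptr = local_fs->Open(file_name_, fs::kReadMode);
MicroPartitionReader::ReaderOptions reader_options;
auto reader = new OrcReader(std::move(file_ptr));
reader->Open(reader_options);
```

Because the handle is no longer shared, responsibility for closing the file sits unambiguously with the writer or reader that received it, which is why each call site re-opens the path rather than reusing a shared pointer.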
tuple_slot = MakeTupleTableSlot(tuple_desc, &TTSOpsVirtual); bool *fake_is_null = @@ -1097,8 +1058,8 @@ TEST_F(OrcTest, WriteReadBigTuple) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); std::vector types; types.emplace_back(pax::porc::proto::Type_Kind::Type_Kind_INT); @@ -1106,7 +1067,7 @@ TEST_F(OrcTest, WriteReadBigTuple) { MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_desc; - auto writer = OrcWriter::CreateWriter(writer_options, types, file_ptr); + auto writer = OrcWriter::CreateWriter(writer_options, types, std::move(file_ptr)); for (size_t i = 0; i < 10000; i++) { tuple_slot->tts_values[0] = Int32GetDatum(i); @@ -1119,7 +1080,7 @@ TEST_F(OrcTest, WriteReadBigTuple) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); for (size_t i = 0; i < 10000; i++) { @@ -1141,14 +1102,14 @@ TEST_F(OrcTest, WriteReadNoFixedColumnInSameTuple) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); @@ -1164,7 +1125,7 @@ TEST_F(OrcTest, WriteReadNoFixedColumnInSameTuple) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(1UL, reader->GetGroupNums()); @@ -1193,14 +1154,14 @@ TEST_F(OrcTest, WriteReadWithNullField) { auto *local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); OrcWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); // str str int // null null int @@ -1228,7 +1189,7 @@ TEST_F(OrcTest, WriteReadWithNullField) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); @@ -1270,14 +1231,14 @@ TEST_F(OrcTest, WriteReadWithBoundNullField) { auto *local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, 
fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); OrcWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); // null null null // str str int @@ -1302,7 +1263,7 @@ TEST_F(OrcTest, WriteReadWithBoundNullField) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); @@ -1336,14 +1297,14 @@ TEST_F(OrcTest, WriteReadWithALLNullField) { auto *local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); OrcWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); tuple_slot->tts_isnull[0] = true; tuple_slot->tts_isnull[1] = true; @@ -1356,7 +1317,7 @@ TEST_F(OrcTest, WriteReadWithALLNullField) { file_ptr = local_fs->Open(file_name_, fs::kReadMode); MicroPartitionReader::ReaderOptions reader_options; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); TupleTableSlot *tuple_slot_empty = CreateTestTupleTableSlot(false); @@ -1392,14 +1353,14 @@ TEST_P(OrcTestProjection, ReadTupleWithProjectionColumn) { proj_map[proj_index] = !proj_map[proj_index]; } - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); MicroPartitionWriter::WriterOptions writer_options; writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); writer->WriteTuple(tuple_slot); writer->Flush(); @@ -1414,7 +1375,7 @@ TEST_P(OrcTestProjection, ReadTupleWithProjectionColumn) { MicroPartitionReader::ReaderOptions reader_options; reader_options.filter = pax_filter; - auto reader = new OrcReader(file_ptr); + auto reader = new OrcReader(std::move(file_ptr)); reader->Open(reader_options); EXPECT_EQ(2UL, reader->GetGroupNums()); @@ -1453,9 +1414,9 @@ TEST_P(OrcEncodingTest, WriterMerge) { ASSERT_NE(nullptr, local_fs); - std::shared_ptr file1_ptr = local_fs->Open(file1_name, fs::kReadWriteMode); - std::shared_ptr file2_ptr = local_fs->Open(file2_name, fs::kReadWriteMode); - std::shared_ptr file3_ptr = local_fs->Open(file3_name, fs::kReadWriteMode); + std::unique_ptr file1_ptr = local_fs->Open(file1_name, fs::kReadWriteMode); + std::unique_ptr file2_ptr = local_fs->Open(file2_name, fs::kReadWriteMode); + std::unique_ptr file3_ptr = local_fs->Open(file3_name, fs::kReadWriteMode); EXPECT_NE(nullptr, file1_ptr); EXPECT_NE(nullptr, 
file2_ptr); EXPECT_NE(nullptr, file3_ptr); @@ -1470,11 +1431,11 @@ TEST_P(OrcEncodingTest, WriterMerge) { writer_options.rel_tuple_desc = tuple_slot->tts_tupleDescriptor; auto *writer1 = new OrcWriter(writer_options, - std::move(CreateTestSchemaTypes()), file1_ptr); + std::move(CreateTestSchemaTypes()), std::move(file1_ptr)); auto *writer2 = new OrcWriter(writer_options, - std::move(CreateTestSchemaTypes()), file2_ptr); + std::move(CreateTestSchemaTypes()), std::move(file2_ptr)); auto *writer3 = new OrcWriter(writer_options, - std::move(CreateTestSchemaTypes()), file3_ptr); + std::move(CreateTestSchemaTypes()), std::move(file3_ptr)); // two group + 51 rows in memory for (size_t i = 0; i < 251; i++) { @@ -1511,7 +1472,7 @@ TEST_P(OrcEncodingTest, WriterMerge) { MicroPartitionReader::ReaderOptions reader_options; file3_ptr = local_fs->Open(file3_name, fs::kReadMode); - auto reader = new OrcReader(file3_ptr); + auto reader = new OrcReader(std::move(file3_ptr)); reader->Open(reader_options); // no memory merge @@ -1664,8 +1625,8 @@ TEST_F(OrcTest, ReadException) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); - EXPECT_NE(nullptr, file_ptr); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + EXPECT_NE(nullptr, file_ptr.get()); current_pb_func_call_times = 0; target_pb_func_call_times = 0; @@ -1675,7 +1636,7 @@ TEST_F(OrcTest, ReadException) { writer_options.group_limit = 10; auto writer = OrcWriter::CreateWriter( - writer_options, std::move(CreateTestSchemaTypes()), file_ptr); + writer_options, std::move(CreateTestSchemaTypes()), std::move(file_ptr)); for (int i = 0; i < 50; i++) { writer->WriteTuple(tuple_slot); } diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_vec_group.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_vec_group.cc index efeecdccb3c..b8690178109 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_vec_group.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_vec_group.cc @@ -32,7 +32,7 @@ namespace pax { inline static std::pair GetColumnDatum(PaxColumn *column, size_t row_index) { if (column->HasNull()) { - auto bm = column->GetBitmap(); + const auto &bm = column->GetBitmap(); Assert(bm); if (!bm->Test(row_index)) { return {0, true}; diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_vec_test.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_vec_test.cc index 64df3bef184..267150e0729 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_vec_test.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_vec_test.cc @@ -97,7 +97,7 @@ TEST_F(OrcVecTest, WriteReadGroup) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); EXPECT_NE(nullptr, file_ptr); std::vector types; @@ -240,7 +240,7 @@ TEST_F(OrcVecTest, WriteReadGroupWithEncoding) { auto local_fs = Singleton::GetInstance(); ASSERT_NE(nullptr, local_fs); - std::shared_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); + std::unique_ptr file_ptr = local_fs->Open(file_name_, fs::kWriteMode); EXPECT_NE(nullptr, file_ptr); std::vector types; diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_writer.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_writer.cc index d338aeb1473..a33d2de0fc1 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/orc_writer.cc +++ b/contrib/pax_storage/src/cpp/storage/orc/orc_writer.cc @@ 
-104,7 +104,6 @@ static std::unique_ptr BuildColumns( const std::vector &types, const TupleDesc desc, const std::vector> &column_encoding_types, - const std::pair &offsets_encoding_types, const PaxStorageFormat &storage_format) { std::unique_ptr columns; bool is_vec; @@ -125,14 +124,7 @@ static std::unique_ptr BuildColumns( encoding_option.is_sign = true; encoding_option.compress_level = std::get<1>(column_encoding_types[i]); - if (offsets_encoding_types.first == ColumnEncoding_Kind_DEF_ENCODED) { - // default value of offsets_stream is zstd - encoding_option.offsets_encode_type = ColumnEncoding_Kind_COMPRESS_ZSTD; - encoding_option.offsets_compress_level = 5; - } else { - encoding_option.offsets_encode_type = offsets_encoding_types.first; - encoding_option.offsets_compress_level = offsets_encoding_types.second; - } + encoding_option.offsets_encode_type = ColumnEncoding_Kind_DIRECT_DELTA; switch (type) { case (pax::porc::proto::Type_Kind::Type_Kind_STRING): { @@ -225,12 +217,12 @@ static std::unique_ptr BuildColumns( OrcWriter::OrcWriter( const MicroPartitionWriter::WriterOptions &writer_options, const std::vector &column_types, - std::shared_ptr file, std::shared_ptr toast_file) + std::unique_ptr file, std::unique_ptr toast_file) : MicroPartitionWriter(writer_options), is_closed_(false), column_types_(column_types), - file_(file), - toast_file_(toast_file), + file_(std::move(file)), + toast_file_(std::move(toast_file)), current_written_phy_size_(0), row_index_(0), total_rows_(0), @@ -241,10 +233,9 @@ OrcWriter::OrcWriter( Assert(writer_options.rel_tuple_desc->natts == static_cast(column_types.size())); - pax_columns_ = BuildColumns(column_types_, writer_options.rel_tuple_desc, - writer_options.encoding_opts, - writer_options.offsets_encoding_opts, - writer_options.storage_format); + pax_columns_ = + BuildColumns(column_types_, writer_options.rel_tuple_desc, + writer_options.encoding_opts, writer_options.storage_format); summary_.rel_oid = writer_options.rel_oid; summary_.block_id = writer_options.block_id; @@ -258,6 +249,16 @@ OrcWriter::OrcWriter( group_stats_.Initialize(writer_options.enable_min_max_col_idxs, writer_options.enable_bf_col_idxs); + + // Precompute slowpath indices for varlena columns (non-byval and typlen == -1) + varlena_slowpath_indices_.clear(); + varlena_slowpath_indices_.reserve(writer_options.rel_tuple_desc->natts); + for (int i = 0; i < writer_options.rel_tuple_desc->natts; ++i) { + auto attrs = TupleDescAttr(writer_options.rel_tuple_desc, i); + if (!attrs->attbyval && attrs->attlen == -1) { + varlena_slowpath_indices_.push_back(i); + } + } } OrcWriter::~OrcWriter() {} @@ -300,7 +301,6 @@ void OrcWriter::Flush() { new_columns = BuildColumns(column_types_, writer_options_.rel_tuple_desc, writer_options_.encoding_opts, - writer_options_.offsets_encoding_opts, writer_options_.storage_format); for (size_t i = 0; i < column_types_.size(); ++i) { @@ -321,8 +321,6 @@ void OrcWriter::Flush() { std::vector> OrcWriter::PrepareWriteTuple( TupleTableSlot *table_slot) { TupleDesc tuple_desc; - int16 type_len; - bool type_by_val; bool is_null; Datum tts_value; char type_storage; @@ -333,18 +331,16 @@ std::vector> OrcWriter::PrepareWriteTuple( Assert(tuple_desc); const auto &required_stats_cols = group_stats_.GetRequiredStatsColsMask(); - for (int i = 0; i < tuple_desc->natts; i++) { + for (int i : varlena_slowpath_indices_) { bool save_origin_datum; auto attrs = TupleDescAttr(tuple_desc, i); - type_len = attrs->attlen; - type_by_val = attrs->attbyval; is_null = 
table_slot->tts_isnull[i]; tts_value = table_slot->tts_values[i]; type_storage = attrs->attstorage; AssertImply(attrs->attisdropped, is_null); - if (is_null || type_by_val || type_len != -1) { + if (is_null) { continue; } @@ -371,7 +367,11 @@ std::vector> OrcWriter::PrepareWriteTuple( // Numeric always need ensure that with the 4B header, otherwise it will // be converted twice in the vectorization path. if (required_stats_cols[i] || VARATT_IS_COMPRESSED(tts_value_vl) || - VARATT_IS_EXTERNAL(tts_value_vl) || attrs->atttypid == NUMERICOID) { + VARATT_IS_EXTERNAL(tts_value_vl) +#ifdef VEC_BUILD + || attrs->atttypid == NUMERICOID +#endif + ) { // still detoast the origin toast detoast_vl = cbdb::PgDeToastDatum(tts_value_vl); Assert(detoast_vl != nullptr); diff --git a/contrib/pax_storage/src/cpp/storage/orc/porc.h b/contrib/pax_storage/src/cpp/storage/orc/porc.h index d846af6b33b..69a10ffaad7 100644 --- a/contrib/pax_storage/src/cpp/storage/orc/porc.h +++ b/contrib/pax_storage/src/cpp/storage/orc/porc.h @@ -50,7 +50,7 @@ class OrcWriter : public MicroPartitionWriter { public: OrcWriter(const MicroPartitionWriter::WriterOptions &orc_writer_options, const std::vector &column_types, - std::shared_ptr file, std::shared_ptr toast_file = nullptr); + std::unique_ptr file, std::unique_ptr toast_file = nullptr); ~OrcWriter() override; @@ -75,8 +75,8 @@ class OrcWriter : public MicroPartitionWriter { // only for test static std::unique_ptr CreateWriter( MicroPartitionWriter::WriterOptions options, - const std::vector &column_types, std::shared_ptr file, - std::shared_ptr toast_file = nullptr) { + const std::vector &column_types, std::unique_ptr file, + std::unique_ptr toast_file = nullptr) { std::vector> all_no_encoding_types; for (auto _ : column_types) { (void)_; @@ -86,7 +86,7 @@ class OrcWriter : public MicroPartitionWriter { options.encoding_opts = all_no_encoding_types; - return std::make_unique(options, column_types, file, toast_file); + return std::make_unique(options, column_types, std::move(file), std::move(toast_file)); } #endif @@ -125,8 +125,8 @@ class OrcWriter : public MicroPartitionWriter { std::vector detoast_memory_holder_; const std::vector column_types_; - std::shared_ptr file_; - std::shared_ptr toast_file_; + std::unique_ptr file_; + std::unique_ptr toast_file_; int32 current_written_phy_size_; WriteSummary summary_; @@ -138,11 +138,14 @@ class OrcWriter : public MicroPartitionWriter { ::pax::porc::proto::Footer file_footer_; ::pax::porc::proto::PostScript post_script_; ::pax::MicroPartitionStats group_stats_; + + // indices of columns that are non-byval and have typlen == -1 (varlena) + std::vector varlena_slowpath_indices_; }; class OrcReader : public MicroPartitionReader { public: - explicit OrcReader(std::shared_ptr file, std::shared_ptr toast_file = nullptr); + explicit OrcReader(std::unique_ptr file, std::unique_ptr toast_file = nullptr); ~OrcReader() override = default; diff --git a/contrib/pax_storage/src/cpp/storage/pax.cc b/contrib/pax_storage/src/cpp/storage/pax.cc index adc268c0102..69282738f4c 100644 --- a/contrib/pax_storage/src/cpp/storage/pax.cc +++ b/contrib/pax_storage/src/cpp/storage/pax.cc @@ -49,6 +49,8 @@ #include "storage/vec/pax_vec_reader.h" #endif +#define PAX_SPLIT_STRATEGY_CHECK_INTERVAL (16) + namespace paxc { class IndexUpdaterInternal { public: @@ -174,8 +176,8 @@ std::unique_ptr TableWriter::CreateMicroPartitionWriter( std::string file_path; std::string toast_file_path; std::string block_id; - std::shared_ptr file; - std::shared_ptr toast_file; + 
std::unique_ptr file; + std::unique_ptr toast_file; int open_flags; int block_number; @@ -200,8 +202,6 @@ std::unique_ptr TableWriter::CreateMicroPartitionWriter( options.file_name = std::move(file_path); options.encoding_opts = GetRelEncodingOptions(); options.storage_format = GetStorageFormat(); - options.offsets_encoding_opts = std::make_pair( - PAX_OFFSETS_DEFAULT_COMPRESSTYPE, PAX_OFFSETS_DEFAULT_COMPRESSLEVEL); options.enable_min_max_col_idxs = GetMinMaxColumnIndexes(); options.enable_bf_col_idxs = GetBloomFilterColumnIndexes(); @@ -239,7 +239,7 @@ std::unique_ptr TableWriter::CreateMicroPartitionWriter( } auto mp_writer = MicroPartitionFileFactory::CreateMicroPartitionWriter( - std::move(options), file, toast_file); + std::move(options), std::move(file), std::move(toast_file)); Assert(mp_writer); mp_writer->SetWriteSummaryCallback(summary_callback_) @@ -261,8 +261,8 @@ void TableWriter::InitOptionsCaches() { } void TableWriter::Open() { - rel_path_ = cbdb::BuildPaxDirectoryPath( - relation_->rd_node, relation_->rd_backend); + rel_path_ = + cbdb::BuildPaxDirectoryPath(relation_->rd_node, relation_->rd_backend); InitOptionsCaches(); @@ -282,14 +282,25 @@ void TableWriter::Open() { // insert tuple into the aux table before inserting any tuples. cbdb::InsertMicroPartitionPlaceHolder(RelationGetRelid(relation_), current_blockno_); + cur_physical_size_ = 0; } void TableWriter::WriteTuple(TupleTableSlot *slot) { Assert(writer_); Assert(strategy_); - // should check split strategy before write tuple - // otherwise, may got a empty file in the disk - if (strategy_->ShouldSplit(writer_->PhysicalSize(), num_tuples_)) { + // Because of the CTID constraint, we have to strictly enforce the accuracy of + // the tuple count and make sure it doesn't exceed + // PAX_MAX_NUM_TUPLES_PER_FILE. That's why we kept this precise check here. + + // On the other hand, the biggest performance hit here is the PhysicalSize() + // function. So to reduce the overhead of calling it so often, + // we only update the cached file size every PAX_SPLIT_STRATEGY_CHECK_INTERVAL + // tuples.
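+ // Note: with PAX_SPLIT_STRATEGY_CHECK_INTERVAL set to 16, the size handed to + // ShouldSplit() below may lag the real file size by up to 15 tuples' worth of + // data, while num_tuples_ itself stays exact, so the + // PAX_MAX_NUM_TUPLES_PER_FILE limit is still enforced precisely.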
+ if ((num_tuples_ % PAX_SPLIT_STRATEGY_CHECK_INTERVAL) == 0) { + cur_physical_size_ = writer_->PhysicalSize(); + } + + if (strategy_->ShouldSplit(cur_physical_size_, num_tuples_)) { writer_->Close(); writer_ = nullptr; Open(); @@ -374,7 +385,7 @@ bool TableReader::GetTuple(TupleTableSlot *slot, ScanDirection direction, size_t row_index = current_block_row_index_; size_t max_row_index; size_t remaining_offset = offset; - std::shared_ptr toast_file; + std::unique_ptr toast_file; bool ok; if (!reader_) { @@ -460,7 +471,7 @@ bool TableReader::GetTuple(TupleTableSlot *slot, ScanDirection direction, reader_ = MicroPartitionFileFactory::CreateMicroPartitionReader( std::move(options), reader_flags, file_system_->Open(current_block_metadata_.GetFileName(), fs::kReadMode), - toast_file); + std::move(toast_file)); // row_index start from 0, so row_index = offset -1 current_block_row_index_ = remaining_offset - 1; @@ -476,7 +487,7 @@ void TableReader::OpenFile() { auto it = iterator_->Next(); current_block_metadata_ = it; MicroPartitionReader::ReaderOptions options; - std::shared_ptr toast_file; + std::unique_ptr toast_file; int32 reader_flags = FLAGS_EMPTY; micro_partition_id_ = it.GetMicroPartitionId(); @@ -509,14 +520,14 @@ void TableReader::OpenFile() { if (it.GetExistToast()) { // must exist the file in disk - toast_file = file_system_->Open(it.GetFileName() + TOAST_FILE_SUFFIX, - fs::kReadMode); + toast_file = + file_system_->Open(it.GetFileName() + TOAST_FILE_SUFFIX, fs::kReadMode); } reader_ = MicroPartitionFileFactory::CreateMicroPartitionReader( std::move(options), reader_flags, file_system_->Open(it.GetFileName(), fs::kReadMode), - toast_file); + std::move(toast_file)); } TableDeleter::TableDeleter( @@ -534,7 +545,7 @@ void TableDeleter::UpdateStatsInAuxTable( const std::vector &min_max_col_idxs, const std::vector &bf_col_idxs, std::shared_ptr filter) { MicroPartitionReader::ReaderOptions options; - std::shared_ptr toast_file; + std::unique_ptr toast_file; int32 reader_flags = FLAGS_EMPTY; TupleTableSlot *slot; @@ -553,7 +564,7 @@ void TableDeleter::UpdateStatsInAuxTable( std::move(options), reader_flags, file_system_->Open(meta.GetFileName(), fs::kReadMode, file_system_options_), - toast_file); + std::move(toast_file)); slot = MakeTupleTableSlot(rel_->rd_att, &TTSOpsVirtual); auto updated_stats = MicroPartitionStatsUpdater(mp_reader.get(), visi_bitmap) @@ -588,8 +599,7 @@ void TableDeleter::DeleteWithVisibilityMap( std::unique_ptr visi_bitmap; auto catalog_update = pax::PaxCatalogUpdater::Begin(rel_); - auto rel_path = cbdb::BuildPaxDirectoryPath( - rel_->rd_node, rel_->rd_backend); + auto rel_path = cbdb::BuildPaxDirectoryPath(rel_->rd_node, rel_->rd_backend); min_max_col_idxs = cbdb::GetMinMaxColumnIndexes(rel_); stats_updater_projection->SetColumnProjection(min_max_col_idxs, @@ -615,8 +625,7 @@ void TableDeleter::DeleteWithVisibilityMap( auto buffer = LoadVisimap(file_system_, file_system_options_, visibility_map_filename); auto visibility_file_bitmap = - Bitmap8(BitmapRaw(buffer->data(), buffer->size()), - Bitmap8::ReadOnlyOwnBitmap); + Bitmap8(BitmapRaw(buffer->data(), buffer->size())); visi_bitmap = Bitmap8::Union(&visibility_file_bitmap, delete_visi_bitmap.get()); @@ -663,12 +672,10 @@ void TableDeleter::DeleteWithVisibilityMap( // TODO: update stats and visimap all in one catalog update // Update the stats in pax aux table // Notice that: PAX won't update the stats in group - UpdateStatsInAuxTable(catalog_update, micro_partition_metadata, - std::make_shared(visi_bitmap->Raw(), - 
Bitmap8::ReadOnlyOwnBitmap), - min_max_col_idxs, - cbdb::GetBloomFilterColumnIndexes(rel_), - stats_updater_projection); + UpdateStatsInAuxTable( + catalog_update, micro_partition_metadata, + std::make_shared(visi_bitmap->Raw()), min_max_col_idxs, + cbdb::GetBloomFilterColumnIndexes(rel_), stats_updater_projection); // write pg_pax_blocks_oid catalog_update.UpdateVisimap(block_id, visimap_file_name); diff --git a/contrib/pax_storage/src/cpp/storage/pax.h b/contrib/pax_storage/src/cpp/storage/pax.h index 1d7a2f6b3fc..fec97e613f3 100644 --- a/contrib/pax_storage/src/cpp/storage/pax.h +++ b/contrib/pax_storage/src/cpp/storage/pax.h @@ -131,6 +131,7 @@ class TableWriter { std::vector> encoding_opts_; bool is_dfs_table_space_; + size_t cur_physical_size_ = 0; }; class TableReader final { diff --git a/contrib/pax_storage/src/cpp/storage/pax_defined.h b/contrib/pax_storage/src/cpp/storage/pax_defined.h index b4ce1115af8..5315797ea3a 100644 --- a/contrib/pax_storage/src/cpp/storage/pax_defined.h +++ b/contrib/pax_storage/src/cpp/storage/pax_defined.h @@ -39,7 +39,7 @@ namespace pax { #define BITS_TO_BYTES(bits) (((bits) + 7) / 8) #define PAX_OFFSETS_DEFAULT_COMPRESSTYPE \ - ColumnEncoding_Kind::ColumnEncoding_Kind_COMPRESS_ZSTD + ColumnEncoding_Kind::ColumnEncoding_Kind_DIRECT_DELTA #define PAX_OFFSETS_DEFAULT_COMPRESSLEVEL 5 #define COLUMN_STORAGE_FORMAT_IS_VEC(column) \ diff --git a/contrib/pax_storage/src/cpp/storage/proto/pax.proto b/contrib/pax_storage/src/cpp/storage/proto/pax.proto index 3e25710027d..765d3e0f8a5 100644 --- a/contrib/pax_storage/src/cpp/storage/proto/pax.proto +++ b/contrib/pax_storage/src/cpp/storage/proto/pax.proto @@ -37,6 +37,7 @@ message ColumnEncoding { COMPRESS_ZLIB = 4; // use ZLIB to compress DICTIONARY = 5; // use dict-endoing + COMPRESS_LZ4 = 6; // use lz4 to compress } optional Kind kind = 1; diff --git a/contrib/pax_storage/src/cpp/storage/toast/pax_toast.cc b/contrib/pax_storage/src/cpp/storage/toast/pax_toast.cc index 3aa54e48ea5..7da38f6ae06 100644 --- a/contrib/pax_storage/src/cpp/storage/toast/pax_toast.cc +++ b/contrib/pax_storage/src/cpp/storage/toast/pax_toast.cc @@ -331,7 +331,7 @@ static std::pair> pax_make_external_toast( std::pair> pax_make_toast( Datum d, char storage_type) { std::shared_ptr mobj; - Datum result; + Datum result = d; if (!pax_enable_toast) { return {d, nullptr}; @@ -495,15 +495,15 @@ size_t pax_detoast_raw(Datum d, char *dst_buff, size_t dst_cap, char *ext_buff, return decompress_size; } -std::pair> pax_detoast( +std::pair> pax_detoast( Datum d, char *ext_buff, size_t ext_buff_size) { - std::shared_ptr value; + std::unique_ptr value; if (VARATT_IS_COMPRESSED(d)) { char *result; size_t raw_size = VARDATA_COMPRESSED_GET_EXTSIZE(d); - value = std::make_shared(raw_size + VARHDRSZ); + value = std::make_unique(raw_size + VARHDRSZ); result = reinterpret_cast(value->Addr()); // only external toast exist invalid compress toast Assert((ToastCompressionId)(TOAST_COMPRESS_METHOD(d)) != @@ -515,8 +515,8 @@ std::pair> pax_detoast( SET_VARSIZE(result, raw_size + VARHDRSZ); - return std::pair>{ - PointerGetDatum(result), value}; + return std::pair>{ + PointerGetDatum(result), std::move(value)}; } else if (VARATT_IS_PAX_EXTERNAL_TOAST(d)) { char *result; Assert(ext_buff); @@ -531,7 +531,7 @@ std::pair> pax_detoast( "buff size=%lu]", offset, raw_size, ext_buff_size)); - value = std::make_shared(origin_size + VARHDRSZ); + value = std::make_unique(origin_size + VARHDRSZ); result = reinterpret_cast(value->Addr()); auto pg_attribute_unused() 
decompress_size = @@ -540,11 +540,11 @@ std::pair> pax_detoast( Assert(decompress_size == origin_size); SET_VARSIZE(result, origin_size + VARHDRSZ); - return std::pair>{ - PointerGetDatum(result), value}; + return std::pair>{ + PointerGetDatum(result), std::move(value)}; } - return std::pair>{d, nullptr}; + return std::pair>{d, nullptr}; } ExternalToastValue::ExternalToastValue(size_t size) diff --git a/contrib/pax_storage/src/cpp/storage/toast/pax_toast.h b/contrib/pax_storage/src/cpp/storage/toast/pax_toast.h index 74d5271b99f..f7825b45256 100644 --- a/contrib/pax_storage/src/cpp/storage/toast/pax_toast.h +++ b/contrib/pax_storage/src/cpp/storage/toast/pax_toast.h @@ -145,7 +145,7 @@ size_t pax_toast_hdr_size(Datum d); // detoast pax toast size_t pax_detoast_raw(Datum d, char *dst_buff, size_t dst_size, char *ext_buff = nullptr, size_t ext_buff_size = 0); -std::pair> pax_detoast( +std::pair> pax_detoast( Datum d, char *ext_buff = nullptr, size_t ext_buff_size = 0); // free pax toast diff --git a/contrib/pax_storage/src/cpp/storage/vec/pax_porc_adpater.cc b/contrib/pax_storage/src/cpp/storage/vec/pax_porc_adpater.cc index 3e77c04bc1a..7545e053b6e 100644 --- a/contrib/pax_storage/src/cpp/storage/vec/pax_porc_adpater.cc +++ b/contrib/pax_storage/src/cpp/storage/vec/pax_porc_adpater.cc @@ -29,6 +29,8 @@ #ifdef VEC_BUILD +#include + #include "comm/vec_numeric.h" #include "storage/columns/pax_column_traits.h" #include "storage/toast/pax_toast.h" @@ -38,6 +40,22 @@ #endif namespace pax { +static inline struct varlena *VarlenaShortTo4B(struct varlena *attr) { + Assert(attr != nullptr); + Assert(VARATT_IS_SHORT(attr)); + Size data_size = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT; + Size new_size = data_size + VARHDRSZ; + + struct varlena *new_attr = + reinterpret_cast(malloc(new_size)); + + Assert(new_attr != nullptr); + + SET_VARSIZE(new_attr, new_size); + memcpy(VARDATA(new_attr), VARDATA_SHORT(attr), data_size); + return new_attr; +} + static void CopyFixedRawBufferWithNull( PaxColumn *column, std::shared_ptr visibility_map_bitset, size_t bitset_index_begin, size_t range_begin, size_t range_lens, @@ -49,7 +67,7 @@ static void CopyFixedRawBufferWithNull( std::tie(buffer, buffer_len) = column->GetRangeBuffer(data_index_begin, data_range_lens); - auto null_bitmap = column->GetBitmap(); + const auto &null_bitmap = column->GetBitmap(); size_t non_null_offset = 0; size_t type_len = column->GetTypeLength(); for (size_t i = range_begin; i < (range_begin + range_lens); i++) { @@ -121,10 +139,10 @@ static void CopyNonFixedBuffer(PaxColumn *column, size_t dst_offset = out_data_buffer->Used(); char *buffer = nullptr; size_t buffer_len = 0; - - auto null_bitmap = column->GetBitmap(); size_t non_null_offset = 0; + const auto &null_bitmap = column->GetBitmap(); + for (size_t i = range_begin; i < (range_begin + range_lens); i++) { if (visibility_map_bitset && visibility_map_bitset->Test(i - range_begin + bitset_index_begin)) { @@ -219,7 +237,7 @@ static void CopyDecimalBuffer(PaxColumn *column, size_t buffer_len = 0; int32 type_len; - auto null_bitmap = column->GetBitmap(); + const auto &null_bitmap = column->GetBitmap(); type_len = VEC_SHORT_NUMERIC_STORE_BYTES; for (size_t i = range_begin; i < (range_begin + range_lens); i++) { @@ -235,16 +253,22 @@ static void CopyDecimalBuffer(PaxColumn *column, out_data_buffer->Brush(type_len); } else { Numeric numeric; + bool should_free = false; size_t num_len = 0; std::tie(buffer, buffer_len) = column->GetBuffer(data_index_begin + non_null_offset); auto vl = (struct 
varlena *)DatumGetPointer(buffer); - Assert(!(VARATT_IS_EXTERNAL(vl) || VARATT_IS_COMPRESSED(vl) || - VARATT_IS_SHORT(vl))); + Assert(!(VARATT_IS_EXTERNAL(vl) || VARATT_IS_COMPRESSED(vl))); num_len = VARSIZE_ANY_EXHDR(vl); - // direct cast - numeric = (Numeric)(buffer); + // The value was already detoasted in OrcWriter::PrepareWriteTuple; a numeric + // with a short varlena header still has to be expanded to a 4B header here + if (unlikely(VARATT_IS_SHORT(vl))) { + numeric = VarlenaShortTo4B(vl); + should_free = true; + } else { // direct cast + numeric = (Numeric)(buffer); + } char *dest_buff = out_data_buffer->GetAvailableBuffer(); Assert(out_data_buffer->Available() >= (size_t)type_len); @@ -253,6 +277,10 @@ static void CopyDecimalBuffer(PaxColumn *column, (int64 *)(dest_buff + sizeof(int64))); out_data_buffer->Brush(type_len); non_null_offset++; + + if (should_free) { + free(numeric); + } } } @@ -279,11 +307,11 @@ void CopyBitPackedBuffer(PaxColumn *column, std::tie(buffer, buffer_len) = column->GetRangeBuffer(data_index_begin, data_range_lens); - auto null_bitmap = column->GetBitmap(); size_t bit_index = 0; size_t non_null_offset = 0; size_t type_len = column->GetTypeLength(); size_t tuple_offset = group_base_offset + range_begin; + const auto &null_bitmap = column->GetBitmap(); for (size_t i = 0; i < range_lens; i++) { bool is_visible = !visibility_map_bitset || @@ -431,7 +459,7 @@ std::pair VecAdapter::AppendPorcFormat(PaxColumns *columns, vec_cache_buffer_[index].is_dict = true; if (column->HasNull()) { - auto null_bitmap = column->GetBitmap(); + const auto &null_bitmap = column->GetBitmap(); size_t non_null_offset = 0; for (size_t i = 0; i < range_lens; i++) { @@ -576,8 +604,7 @@ std::pair VecAdapter::AppendPorcFormat(PaxColumns *columns, vec_buffer->Set(boolean_buffer, align_size); Bitmap8 vec_bool_bitmap( - BitmapRaw((uint8 *)(boolean_buffer), align_size), - BitmapTpl::ReadOnlyRefBitmap); + BitmapRaw((uint8 *)(boolean_buffer), align_size)); CopyBitPackedBuffer(column, micro_partition_visibility_bitmap_, group_base_offset_, range_begin, range_lens, diff --git a/contrib/pax_storage/src/cpp/storage/vec/pax_vec_comm.cc b/contrib/pax_storage/src/cpp/storage/vec/pax_vec_comm.cc index 160d2928b9b..11f31caad95 100644 --- a/contrib/pax_storage/src/cpp/storage/vec/pax_vec_comm.cc +++ b/contrib/pax_storage/src/cpp/storage/vec/pax_vec_comm.cc @@ -88,12 +88,12 @@ void CopyBitmapBuffer(PaxColumn *column, Assert(!null_bits_buffer->GetBuffer()); null_bits_buffer->Set(BlockBuffer::Alloc(null_align_bytes), null_align_bytes); - auto bitmap = column->GetBitmap(); + const auto &bitmap = column->GetBitmap(); Assert(bitmap); CopyBitmap(bitmap.get(), range_begin, range_lens, null_bits_buffer); *out_visable_null_counts = range_lens - data_range_lens; } else { - auto bitmap = column->GetBitmap(); + const auto &bitmap = column->GetBitmap(); Assert(bitmap); Bitmap8 null_bitmap(out_range_lens); diff --git a/contrib/pax_storage/src/cpp/storage/vec_parallel_common.cc b/contrib/pax_storage/src/cpp/storage/vec_parallel_common.cc index f3ca311628e..7e57b9ae011 100644 --- a/contrib/pax_storage/src/cpp/storage/vec_parallel_common.cc +++ b/contrib/pax_storage/src/cpp/storage/vec_parallel_common.cc @@ -132,7 +132,7 @@ bool PaxFragmentInterface::OpenFile() { InitAdapter(); auto data_file = file_system->Open(m->GetFileName(), fs::kReadMode, desc->GetFileSystemOptions()); - std::shared_ptr toast_file; + std::unique_ptr toast_file; if (auto name = m->GetToastName(); !name.empty()) { toast_file = file_system->Open(name, fs::kReadMode,
desc->GetFileSystemOptions()); } diff --git a/contrib/pax_storage/src/cpp/storage/vec_parallel_pax.cc b/contrib/pax_storage/src/cpp/storage/vec_parallel_pax.cc index a93d5d17646..632be28f827 100644 --- a/contrib/pax_storage/src/cpp/storage/vec_parallel_pax.cc +++ b/contrib/pax_storage/src/cpp/storage/vec_parallel_pax.cc @@ -60,7 +60,7 @@ class MicroPartitionInfo : public MicroPartitionInfoProvider { if (!visimap_name.empty()) { visimap = pax::LoadVisimap(file_system, nullptr, visimap_name); BitmapRaw raw(visimap->data(), visimap->size()); - bitmap = std::make_unique(raw, BitmapTpl::ReadOnlyRefBitmap); + bitmap = std::make_unique(raw); } return {std::move(visimap), std::move(bitmap)}; } diff --git a/contrib/pax_storage/src/test/regress/expected/DML_over_joins_optimizer.out b/contrib/pax_storage/src/test/regress/expected/DML_over_joins_optimizer.out index aa87cea2a3e..abdad77c7f7 100644 --- a/contrib/pax_storage/src/test/regress/expected/DML_over_joins_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/DML_over_joins_optimizer.out @@ -1613,22 +1613,21 @@ HINT: For non-partitioned tables, run analyze (). For QUERY PLAN ------------------------------------------------------------------------------ Delete on tab1 - -> Result - -> Redistribute Motion 3:3 (slice1; segments: 3) - Hash Key: tab1.b - -> Hash Join - Hash Cond: (tab2.a = tab1.a) - -> Seq Scan on tab2 - -> Hash - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: tab1.a - -> Hash Join - Hash Cond: (tab3.b = tab1.b) - -> Seq Scan on tab3 - -> Hash - -> Seq Scan on tab1 + -> Hash Join + Hash Cond: (tab3.b = tab1.b) + -> Seq Scan on tab3 + -> Hash + -> Redistribute Motion 3:3 (slice1; segments: 3) + Hash Key: tab1.b + -> Hash Join + Hash Cond: (tab2.a = tab1.a) + -> Seq Scan on tab2 + -> Hash + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: tab1.a + -> Seq Scan on tab1 Optimizer: GPORCA -(16 rows) +(15 rows) -- ---------------------------------------------------------------------- -- Test: teardown.sql diff --git a/contrib/pax_storage/src/test/regress/expected/autostats.out b/contrib/pax_storage/src/test/regress/expected/autostats.out index 6b184579907..6061799b2b4 100644 --- a/contrib/pax_storage/src/test/regress/expected/autostats.out +++ b/contrib/pax_storage/src/test/regress/expected/autostats.out @@ -11,7 +11,7 @@ -- end_matchignore set gp_autostats_mode=on_change; set gp_autostats_on_change_threshold=9; -set pax_enable_debug = false; +set pax.enable_debug = false; set log_autostats=on; set client_min_messages=log; reset optimizer_trace_fallback; diff --git a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out index cc84f9983ff..236f6b93542 100644 --- a/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/bfv_joins_optimizer.out @@ -2982,27 +2982,27 @@ ON (member_group.group_id IN (12,13,14,15) AND member_subgroup.subgroup_name = r QUERY PLAN --------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) - -> Hash Join - Hash Cond: (member."group_id" = member_group."group_id") - -> Seq Scan on member - -> Hash - -> Broadcast Motion 3:3 (slice2; segments: 3) - -> Hash Left Join - Hash Cond: (member_subgroup.subgroup_name = (region.county_name)::text) - Join Filter: (member_group."group_id" = ANY ('{12,13,14,15}'::integer[])) - -> Redistribute Motion 3:3 (slice3; 
segments: 3) - Hash Key: member_subgroup.subgroup_name - -> Hash Join - Hash Cond: (member_subgroup."group_id" = member_group."group_id") - -> Redistribute Motion 3:3 (slice4; segments: 3) - Hash Key: member_subgroup."group_id" + -> Hash Left Join + Hash Cond: (member_subgroup.subgroup_name = (region.county_name)::text) + Join Filter: (member_group."group_id" = ANY ('{12,13,14,15}'::integer[])) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: member_subgroup.subgroup_name + -> Hash Join + Hash Cond: (member."group_id" = member_group."group_id") + -> Redistribute Motion 3:3 (slice3; segments: 3) + Hash Key: member."group_id" + -> Hash Join + Hash Cond: (member."group_id" = member_subgroup."group_id") + -> Seq Scan on member + -> Hash + -> Broadcast Motion 3:3 (slice4; segments: 3) -> Seq Scan on member_subgroup - -> Hash - -> Seq Scan on member_group - -> Hash - -> Redistribute Motion 3:3 (slice5; segments: 3) - Hash Key: region.county_name - -> Seq Scan on region + -> Hash + -> Seq Scan on member_group + -> Hash + -> Redistribute Motion 3:3 (slice5; segments: 3) + Hash Key: region.county_name + -> Seq Scan on region Optimizer: Pivotal Optimizer (GPORCA) (23 rows) diff --git a/contrib/pax_storage/src/test/regress/expected/create_index.out b/contrib/pax_storage/src/test/regress/expected/create_index.out index ac4958d56dc..963ce9a6b39 100644 --- a/contrib/pax_storage/src/test/regress/expected/create_index.out +++ b/contrib/pax_storage/src/test/regress/expected/create_index.out @@ -2690,7 +2690,6 @@ COMMIT; -- we keep the `CONCURRENTLY` to make the following commands fail, -- so these commands will not cause deadlock with test create_view, -- like `drop schema xxx cascade;`. --- See more details at https://code.hashdata.xyz/cloudberry/cbdb/-/issues/54 REINDEX TABLE CONCURRENTLY pg_class; -- no catalog relation ERROR: cannot reindex system catalogs concurrently REINDEX INDEX CONCURRENTLY pg_class_oid_index; -- no catalog index diff --git a/contrib/pax_storage/src/test/regress/expected/create_index_optimizer.out b/contrib/pax_storage/src/test/regress/expected/create_index_optimizer.out index 8d0a41352a8..81728a31260 100644 --- a/contrib/pax_storage/src/test/regress/expected/create_index_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/create_index_optimizer.out @@ -2717,7 +2717,6 @@ COMMIT; -- we keep the `CONCURRENTLY` to make the following commands fail, -- so these commands will not cause deadlock with test create_view, -- like `drop schema xxx cascade;`. 
--- See more details at https://code.hashdata.xyz/cloudberry/cbdb/-/issues/54 REINDEX TABLE CONCURRENTLY pg_class; -- no catalog relation ERROR: cannot reindex system catalogs concurrently REINDEX INDEX CONCURRENTLY pg_class_oid_index; -- no catalog index diff --git a/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out b/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out index 545d14625f4..46479ca2126 100644 --- a/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out +++ b/contrib/pax_storage/src/test/regress/expected/create_table_distpol.out @@ -128,9 +128,7 @@ select distkey from gp_distribution_policy where localoid = 'distpol_person_copy RESET gp_create_table_random_default_distribution; -- Test duplicate distribute keys CREATE TABLE ctas_dup_dk as SELECT distinct age as c1, age as c2 from distpol_person; -ERROR: duplicate DISTRIBUTED BY column 'c1' SELECT distinct age c1, age c2 into ctas_dup_dk_1 from distpol_person; -ERROR: duplicate DISTRIBUTED BY column 'c1' -- -- Test deriving distribution key from the query's distribution in -- CREATE TABLE AS diff --git a/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out b/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out index 250ad329a5e..db4d467570b 100644 --- a/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/gp_dqa_optimizer.out @@ -65,20 +65,18 @@ select count(distinct d) from dqa_t1 group by i; (12 rows) explain (costs off) select count(distinct d) from dqa_t1 group by i; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate + -> GroupAggregate Group Key: i - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: i - -> Partial GroupAggregate - Group Key: i - -> Sort - Sort Key: i, d - -> Seq Scan on dqa_t1 + -> Sort + Sort Key: i + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: i + -> Seq Scan on dqa_t1 Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0 -(11 rows) +(9 rows) select count(distinct d), sum(distinct d) from dqa_t1 group by i; count | sum @@ -98,20 +96,18 @@ select count(distinct d), sum(distinct d) from dqa_t1 group by i; (12 rows) explain (costs off) select count(distinct d), sum(distinct d) from dqa_t1 group by i; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate + -> GroupAggregate Group Key: i - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: i - -> Partial GroupAggregate - Group Key: i - -> Sort - Sort Key: i, d - -> Seq Scan on dqa_t1 + -> Sort + Sort Key: i + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: i + -> Seq Scan on dqa_t1 Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0 -(11 rows) +(9 rows) select count(distinct d), count(distinct dt) from dqa_t1; count | count @@ -1909,20 +1905,18 @@ select count(distinct d) from dqa_t1 group by i; (12 rows) explain (costs off) select count(distinct d) from dqa_t1 group by i; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate + -> GroupAggregate Group Key: i 
- -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: i - -> Partial GroupAggregate - Group Key: i - -> Sort - Sort Key: i, d - -> Seq Scan on dqa_t1 + -> Sort + Sort Key: i + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: i + -> Seq Scan on dqa_t1 Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0 -(11 rows) +(9 rows) select count(distinct d), count(distinct c), count(distinct dt) from dqa_t1; count | count | count diff --git a/contrib/pax_storage/src/test/regress/expected/gp_toolkit.out b/contrib/pax_storage/src/test/regress/expected/gp_toolkit.out index 336354081af..745db42283a 100644 --- a/contrib/pax_storage/src/test/regress/expected/gp_toolkit.out +++ b/contrib/pax_storage/src/test/regress/expected/gp_toolkit.out @@ -304,7 +304,7 @@ update pg_statistic set stawidth=2034567890 where starelid = 'wide_width_test':: select btdrelpages, btdexppages from gp_toolkit.gp_bloat_expected_pages where btdrelid='wide_width_test'::regclass; btdrelpages | btdexppages -------------+------------- - 4 | 3104504228 + 1 | 3104504228 (1 row) select * from gp_toolkit.gp_bloat_diag WHERE bdinspname <> 'pg_catalog'; diff --git a/contrib/pax_storage/src/test/regress/expected/gporca.out b/contrib/pax_storage/src/test/regress/expected/gporca.out index 91a33a91bd4..25dc1938bed 100644 --- a/contrib/pax_storage/src/test/regress/expected/gporca.out +++ b/contrib/pax_storage/src/test/regress/expected/gporca.out @@ -10666,7 +10666,7 @@ create table foo(a int, b int) distributed by (a); -- and log_min_duration_statement, they are the most obvious ones. set log_statement='none'; set log_min_duration_statement=-1; -set pax_enable_debug to off; +set pax.enable_debug to off; set client_min_messages='log'; explain select count(*) from foo group by cube(a,b); QUERY PLAN @@ -10687,7 +10687,7 @@ explain select count(*) from foo group by cube(a,b); (9 rows) reset client_min_messages; -reset pax_enable_debug; +reset pax.enable_debug; reset log_statement; reset log_min_duration_statement; -- TVF accepts ANYENUM, ANYELEMENT returns ANYENUM, ANYARRAY @@ -10804,11 +10804,11 @@ explain select * from foo where b in ('1', '2'); set optimizer_enable_ctas = off; set log_statement='none'; set log_min_duration_statement=-1; -set pax_enable_debug to off; +set pax.enable_debug to off; set client_min_messages='log'; create table foo_ctas(a) as (select generate_series(1,10)) distributed by (a); reset client_min_messages; -reset pax_enable_debug; +reset pax.enable_debug; reset log_min_duration_statement; reset log_statement; reset optimizer_enable_ctas; diff --git a/contrib/pax_storage/src/test/regress/expected/gporca_optimizer.out b/contrib/pax_storage/src/test/regress/expected/gporca_optimizer.out index d638a97071b..c6b03a95ae1 100644 --- a/contrib/pax_storage/src/test/regress/expected/gporca_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/gporca_optimizer.out @@ -2396,9 +2396,9 @@ explain select case when bar1.x2 = bar2.x2 then coalesce((select 1 from orca.foo from orca.bar1 inner join orca.bar2 on (bar1.x2 = bar2.x2) order by bar1.x1; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ - Sort (cost=0.00..1765423.13 rows=20 width=8) + Sort (cost=0.00..1765423.00 rows=20 width=8) Sort Key: bar1.x1 - -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1765423.13 rows=20 width=8) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1765423.00 rows=20 width=8) -> Hash Join (cost=0.00..862.00 rows=7 width=12) Hash 
Cond: (bar1.x2 = bar2.x2) -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=7 width=8) @@ -2416,7 +2416,7 @@ from orca.bar1 inner join orca.bar2 on (bar1.x2 = bar2.x2) order by bar1.x1; -> Materialize (cost=0.00..431.00 rows=10 width=4) -> Broadcast Motion 3:3 (slice4; segments: 3) (cost=0.00..431.00 rows=10 width=4) -> Seq Scan on foo (cost=0.00..431.00 rows=4 width=4) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (21 rows) select case when bar1.x2 = bar2.x2 then coalesce((select 1 from orca.foo where bar1.x2 = bar2.x2 and bar1.x2 = random() and foo.x2 = bar2.x2),0) else 1 end as col1, bar1.x1 @@ -9907,15 +9907,15 @@ analyze orca.bm_test; create index bm_test_idx on orca.bm_test using bitmap (i); set optimizer_enable_bitmapscan=on; explain select * from orca.bm_test where i=2 and t='2'; - QUERY PLAN + QUERY PLAN -------------------------------------------------------------------------------- - Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..204.39 rows=2 width=6) - -> Bitmap Heap Scan on bm_test (cost=0.00..204.39 rows=2 width=6) + Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..397.98 rows=2 width=6) + -> Bitmap Heap Scan on bm_test (cost=0.00..397.98 rows=1 width=6) Recheck Cond: (i = 2) Filter: (t = '2'::text) -> Bitmap Index Scan on bm_test_idx (cost=0.00..0.00 rows=0 width=0) Index Cond: (i = 2) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (7 rows) select * from orca.bm_test where i=2 and t='2'; @@ -10050,10 +10050,10 @@ create index bm_multi_test_idx_part on orca.bm_dyn_test_multilvl_part using bitm analyze orca.bm_dyn_test_multilvl_part; -- print name of parent index explain select * from orca.bm_dyn_test_multilvl_part where year = 2019; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..7.95 rows=53 width=18) - -> Append (cost=0.00..6.89 rows=18 width=18) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..5.13 rows=59 width=18) + -> Append (cost=0.00..4.34 rows=20 width=18) -> Seq Scan on bm_dyn_test_multilvl_part_1_prt_2_2_prt_1_3_prt_usa bm_dyn_test_multilvl_part_1 (cost=0.00..1.01 rows=1 width=44) Filter: (year = 2019) -> Seq Scan on bm_dyn_test_multilvl_part_1_prt_2_2_prt_1_3_prt_other_regions bm_dyn_test_multilvl_part_2 (cost=0.00..1.01 rows=1 width=44) @@ -10290,8 +10290,8 @@ WHERE tq.sym = tt.symbol AND tt.event_ts < tq.end_ts GROUP BY 1 ORDER BY 1 asc ; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------- GroupAggregate (cost=0.00..862.00 rows=1 width=16) Group Key: ((((tt.event_ts / 100000) / 5) * 5)) -> Sort (cost=0.00..862.00 rows=1 width=8) @@ -10306,7 +10306,7 @@ ORDER BY 1 asc ; Number of partitions to scan: 2 (out of 2) -> Hash (cost=431.00..431.00 rows=1 width=24) -> Seq Scan on my_tt_agg_opt tt (cost=0.00..431.00 rows=1 width=24) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (15 rows) -- MPP-25661: IndexScan crashing for qual with reference to outer tuple @@ -10349,8 
+10349,8 @@ explain select id, comment from idxscan_outer as o join idxscan_inner as i on o. where ordernum between 10 and 20; QUERY PLAN ------------------------------------------------------------------------------------------------------ - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.01 rows=3 width=9) - -> Nested Loop (cost=0.00..1324033.01 rows=1 width=9) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.98 rows=3 width=9) + -> Nested Loop (cost=0.00..1324032.98 rows=1 width=9) Join Filter: (o.id = i.productid) -> Seq Scan on idxscan_outer o (cost=0.00..431.00 rows=3 width=4) -> Materialize (cost=0.00..431.00 rows=1 width=9) @@ -10530,45 +10530,45 @@ select disable_xform('CXformSelect2IndexGet'); EXPLAIN SELECT * FROM btree_test WHERE a in (1, 47); QUERY PLAN ------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..204.38 rows=3 width=8) - -> Bitmap Heap Scan on btree_test (cost=0.00..204.38 rows=1 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..387.97 rows=3 width=8) + -> Bitmap Heap Scan on btree_test (cost=0.00..387.97 rows=1 width=8) Recheck Cond: (a = ANY ('{1,47}'::integer[])) -> Bitmap Index Scan on btree_test_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{1,47}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM btree_test WHERE a in ('2', 47); QUERY PLAN ------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..204.38 rows=3 width=8) - -> Bitmap Heap Scan on btree_test (cost=0.00..204.38 rows=1 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..387.97 rows=3 width=8) + -> Bitmap Heap Scan on btree_test (cost=0.00..387.97 rows=1 width=8) Recheck Cond: (a = ANY ('{2,47}'::integer[])) -> Bitmap Index Scan on btree_test_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{2,47}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM btree_test WHERE a in ('1', '2'); QUERY PLAN ------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..204.38 rows=3 width=8) - -> Bitmap Heap Scan on btree_test (cost=0.00..204.38 rows=1 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..387.97 rows=3 width=8) + -> Bitmap Heap Scan on btree_test (cost=0.00..387.97 rows=1 width=8) Recheck Cond: (a = ANY ('{1,2}'::integer[])) -> Bitmap Index Scan on btree_test_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{1,2}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM btree_test WHERE a in ('1', '2', 47); QUERY PLAN ------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..613.15 rows=4 width=8) - -> Bitmap Heap Scan on btree_test (cost=0.00..613.15 rows=2 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..387.97 rows=4 width=8) + -> Bitmap Heap Scan on btree_test (cost=0.00..387.97 rows=2 width=8) Recheck Cond: (a = ANY ('{1,2,47}'::integer[])) -> Bitmap Index Scan on btree_test_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{1,2,47}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) SELECT * FROM btree_test WHERE a in ('1', '2', 47); @@ -10614,56 +10614,56 @@ CREATE 
INDEX bitmap_index ON bitmap_test USING BITMAP(a); EXPLAIN SELECT * FROM bitmap_test WHERE a in (1); QUERY PLAN --------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..68.82 rows=2 width=4) - -> Bitmap Heap Scan on bitmap_test (cost=0.00..68.82 rows=1 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..391.30 rows=1 width=4) + -> Bitmap Heap Scan on bitmap_test (cost=0.00..391.30 rows=1 width=4) Recheck Cond: (a = 1) -> Bitmap Index Scan on bitmap_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = 1) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM bitmap_test WHERE a in (1, 47); QUERY PLAN --------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..204.38 rows=3 width=4) - -> Bitmap Heap Scan on bitmap_test (cost=0.00..204.38 rows=1 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..407.97 rows=3 width=4) + -> Bitmap Heap Scan on bitmap_test (cost=0.00..407.97 rows=1 width=4) Recheck Cond: (a = ANY ('{1,47}'::integer[])) -> Bitmap Index Scan on bitmap_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{1,47}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM bitmap_test WHERE a in ('2', 47); QUERY PLAN --------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..204.38 rows=3 width=4) - -> Bitmap Heap Scan on bitmap_test (cost=0.00..204.38 rows=1 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..407.97 rows=3 width=4) + -> Bitmap Heap Scan on bitmap_test (cost=0.00..407.97 rows=1 width=4) Recheck Cond: (a = ANY ('{2,47}'::integer[])) -> Bitmap Index Scan on bitmap_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{2,47}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM bitmap_test WHERE a in ('1', '2'); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..204.38 rows=3 width=4) - -> Bitmap Heap Scan on bitmap_test (cost=0.00..204.38 rows=1 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..407.97 rows=3 width=4) + -> Bitmap Heap Scan on bitmap_test (cost=0.00..407.97 rows=1 width=4) Recheck Cond: (a = ANY ('{1,2}'::integer[])) -> Bitmap Index Scan on bitmap_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{1,2}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) EXPLAIN SELECT * FROM bitmap_test WHERE a in ('1', '2', 47); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..417.97 rows=4 width=4) -> Bitmap Heap Scan on bitmap_test (cost=0.00..417.97 rows=2 width=4) Recheck Cond: (a = ANY ('{1,2,47}'::integer[])) -> Bitmap Index Scan on bitmap_index (cost=0.00..0.00 rows=0 width=0) Index Cond: (a = ANY ('{1,2,47}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (6 rows) -- Test Logging for unsupported features in ORCA @@ -10677,7 +10677,7 @@ create table foo(a int, b int) distributed by (a); -- and log_min_duration_statement, they are the most obvious ones. 
set log_statement='none'; set log_min_duration_statement=-1; -set pax_enable_debug to off; +set pax.enable_debug to off; set client_min_messages='log'; explain select count(*) from foo group by cube(a,b); QUERY PLAN @@ -10715,7 +10715,7 @@ explain select count(*) from foo group by cube(a,b); (30 rows) reset client_min_messages; -reset pax_enable_debug; +reset pax.enable_debug; reset log_statement; reset log_min_duration_statement; -- TVF accepts ANYENUM, ANYELEMENT returns ANYENUM, ANYARRAY @@ -10825,18 +10825,18 @@ explain select * from foo where b in ('1', '2'); Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=12) -> Seq Scan on foo (cost=0.00..431.00 rows=1 width=12) Filter: ((b)::text = ANY ('{1,2}'::text[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (4 rows) set optimizer_enable_ctas = off; set log_statement='none'; set log_min_duration_statement=-1; -set pax_enable_debug to off; +set pax.enable_debug to off; set client_min_messages='log'; create table foo_ctas(a) as (select generate_series(1,10)) distributed by (a); LOG: 2023-08-17 15:11:09:454388 PDT,THD000,NOTICE,"Falling back to Postgres-based planner because GPORCA does not support the following feature: CTAS. Set optimizer_enable_ctas to on to enable CTAS with GPORCA", reset client_min_messages; -reset pax_enable_debug; +reset pax.enable_debug; reset log_min_duration_statement; reset log_statement; reset optimizer_enable_ctas; @@ -10855,8 +10855,8 @@ set optimizer_force_multistage_agg = off; set optimizer_force_three_stage_scalar_dqa = off; -- end_ignore explain (costs off) select count(*), t2.c from input_tab1 t1 left join input_tab2 t2 on t1.a = t2.c group by t2.c; - QUERY PLAN --------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) -> GroupAggregate Group Key: t2.c @@ -10869,7 +10869,7 @@ explain (costs off) select count(*), t2.c from input_tab1 t1 left join input_tab -> Seq Scan on input_tab1 t1 -> Hash -> Seq Scan on input_tab2 t2 - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (13 rows) select count(*), t2.c from input_tab1 t1 left join input_tab2 t2 on t1.a = t2.c group by t2.c; @@ -10923,7 +10923,7 @@ FROM (SELECT * -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=7) -> Seq Scan on tab_2 (cost=0.00..431.00 rows=1 width=7) -> Seq Scan on tab_3 (cost=0.00..431.00 rows=1 width=8) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (11 rows) SELECT Count(*) @@ -11158,15 +11158,15 @@ INSERT INTO csq_cast_param_inner VALUES EXPLAIN SELECT a FROM csq_cast_param_outer WHERE b in (SELECT CASE WHEN a > 1 THEN d ELSE '42' END FROM csq_cast_param_inner); QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.56 rows=2 width=4) - -> Seq Scan on csq_cast_param_outer (cost=0.00..1324032.56 rows=1 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.38 rows=1 width=4) + -> Seq Scan on csq_cast_param_outer (cost=0.00..1324032.38 rows=1 width=4) Filter: (SubPlan 1) SubPlan 1 - -> Result (cost=0.00..431.00 rows=2 width=4) - -> Materialize (cost=0.00..431.00 rows=2 width=4) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=2 width=4) + -> Result (cost=0.00..431.00 rows=1 width=4) + -> Materialize (cost=0.00..431.00 rows=1 width=4) + -> 
Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) -> Seq Scan on csq_cast_param_inner (cost=0.00..431.00 rows=1 width=4) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (9 rows) SELECT a FROM csq_cast_param_outer WHERE b in (SELECT CASE WHEN a > 1 THEN d ELSE '42' END FROM csq_cast_param_inner); @@ -11182,15 +11182,15 @@ CREATE CAST (myint AS numeric) WITH FUNCTION myint_numeric(myint) AS IMPLICIT; EXPLAIN SELECT a FROM csq_cast_param_outer WHERE b in (SELECT CASE WHEN a > 1 THEN d ELSE '42' END FROM csq_cast_param_inner); QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.56 rows=2 width=4) - -> Seq Scan on csq_cast_param_outer (cost=0.00..1324032.56 rows=1 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.38 rows=1 width=4) + -> Seq Scan on csq_cast_param_outer (cost=0.00..1324032.38 rows=1 width=4) Filter: (SubPlan 1) SubPlan 1 - -> Result (cost=0.00..431.00 rows=2 width=4) - -> Materialize (cost=0.00..431.00 rows=2 width=4) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=2 width=4) + -> Result (cost=0.00..431.00 rows=1 width=4) + -> Materialize (cost=0.00..431.00 rows=1 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) -> Seq Scan on csq_cast_param_inner (cost=0.00..431.00 rows=1 width=4) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (9 rows) SELECT a FROM csq_cast_param_outer WHERE b in (SELECT CASE WHEN a > 1 THEN d ELSE '42' END FROM csq_cast_param_inner); @@ -11211,19 +11211,19 @@ SELECT a FROM ggg WHERE a IN (NULL, 'x'); EXPLAIN SELECT a FROM ggg WHERE a NOT IN (NULL, ''); QUERY PLAN ------------------------------------------------------------------------------ - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=2) - -> Seq Scan on ggg (cost=0.00..431.00 rows=1 width=2) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) + -> Seq Scan on ggg (cost=0.00..431.00 rows=1 width=8) Filter: (a <> ALL ('{NULL,""}'::bpchar[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (4 rows) EXPLAIN SELECT a FROM ggg WHERE a IN (NULL, 'x'); QUERY PLAN ------------------------------------------------------------------------------ - Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..431.00 rows=1 width=2) - -> Seq Scan on ggg (cost=0.00..431.00 rows=1 width=2) + Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..431.00 rows=1 width=8) + -> Seq Scan on ggg (cost=0.00..431.00 rows=1 width=8) Filter: (a = ANY ('{NULL,x}'::bpchar[])) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (4 rows) -- result node with one time filter and filter @@ -11280,10 +11280,9 @@ EXPLAIN (COSTS OFF) WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM Filter: (share0_ref3.b = f1.b) -> Materialize -> Broadcast Motion 3:3 (slice6; segments: 3) - -> Result - -> Shared Scan (share slice:id 6:0) + -> Shared Scan (share slice:id 6:0) Optimizer: GPORCA -(43 rows) +(42 rows) WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; ?column? 
| coalesce | b @@ -11409,28 +11408,39 @@ SELECT * FROM ds_part, non_part2 WHERE ds_part.c = non_part2.e AND non_part2.f = (0 rows) explain analyze SELECT * FROM ds_part, non_part2 WHERE ds_part.c = non_part2.e AND non_part2.f = 10 AND a IN ( SELECT b + 1 FROM non_part1); - QUERY PLAN --------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324481.18 rows=1 width=20) - -> Hash Join (cost=0.00..1324481.18 rows=1 width=20) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324480.95 rows=1 width=20) (actual time=4.000..4.000 rows=0 loops=1) + -> Hash Join (cost=0.00..1324480.95 rows=1 width=20) (actual time=4.000..4.000 rows=0 loops=1) Hash Cond: (ds_part.c = non_part2.e) - -> Dynamic Seq Scan on ds_part (cost=0.00..1324050.11 rows=334 width=12) + -> Dynamic Seq Scan on ds_part (cost=0.00..1324049.89 rows=334 width=12) (actual time=0.000..0.000 rows=0 loops=1) Number of partitions to scan: 6 (out of 6) Filter: ((a = (b + 1)) AND (SubPlan 1)) + Partitions scanned: Avg 1.0 x 3 workers. Max 1 parts (seg0). SubPlan 1 - -> Materialize (cost=0.00..431.00 rows=1 width=4) - -> Broadcast Motion 1:3 (slice2) (cost=0.00..431.00 rows=1 width=4) - -> Limit (cost=0.00..431.00 rows=1 width=4) - -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) - -> Limit (cost=0.00..431.00 rows=1 width=4) - -> Seq Scan on non_part1 (cost=0.00..431.00 rows=34 width=4) - -> Hash (cost=431.00..431.00 rows=1 width=8) - -> Partition Selector (selector id: $0) (cost=0.00..431.00 rows=1 width=8) - -> Broadcast Motion 3:3 (slice4; segments: 3) (cost=0.00..431.00 rows=1 width=8) - -> Seq Scan on non_part2 (cost=0.00..431.00 rows=1 width=8) + -> Materialize (cost=0.00..431.00 rows=1 width=4) (never executed) + -> Broadcast Motion 1:3 (slice2) (cost=0.00..431.00 rows=1 width=4) (never executed) + -> Limit (cost=0.00..431.00 rows=1 width=4) (actual time=4.000..4.000 rows=1 loops=1) + -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) (actual time=4.000..4.000 rows=1 loops=1) + -> Limit (cost=0.00..431.00 rows=1 width=4) (actual time=0.000..0.000 rows=1 loops=1) + -> Seq Scan on non_part1 (cost=0.00..431.00 rows=34 width=4) (actual time=0.000..0.000 rows=1 loops=1) + -> Hash (cost=431.00..431.00 rows=1 width=8) (actual time=4.000..4.000 rows=1 loops=1) + Buckets: 262144 Batches: 1 Memory Usage: 2049kB + -> Partition Selector (selector id: $0) (cost=0.00..431.00 rows=1 width=8) (actual time=4.000..4.000 rows=1 loops=1) + -> Broadcast Motion 3:3 (slice4; segments: 3) (cost=0.00..431.00 rows=1 width=8) (actual time=4.000..4.000 rows=1 loops=1) + -> Seq Scan on non_part2 (cost=0.00..431.00 rows=1 width=8) (actual time=0.000..0.000 rows=1 loops=1) Filter: (f = 10) - Optimizer: Pivotal Optimizer (GPORCA) -(19 rows) + Rows Removed by Filter: 24 + Planning Time: 24.393 ms + (slice0) Executor memory: 81K bytes. + (slice1) Executor memory: 2231K bytes avg x 3x(0) workers, 2231K bytes max (seg0). Work_mem: 2049K bytes max. + (slice2) Executor memory: 20K bytes (entry db). + (slice3) Executor memory: 118K bytes avg x 3x(0) workers, 118K bytes max (seg0). + (slice4) Executor memory: 115K bytes avg x 3x(0) workers, 115K bytes max (seg0). 
+ Memory used: 128000kB + Optimizer: GPORCA + Execution Time: 2.262 ms +(30 rows) SELECT *, a IN ( SELECT b + 1 FROM non_part1) FROM ds_part, non_part2 WHERE ds_part.c = non_part2.e AND non_part2.f = 10 AND a IN ( SELECT b FROM non_part1); a | b | c | e | f | ?column? @@ -11620,12 +11630,12 @@ update gp_distribution_policy set numsegments = numsegments-1 where localoid = ' reset allow_system_table_mods; -- populate the tables on this smaller cluster explain insert into gpexp_hash select i, i from generate_series(1,50) i; - QUERY PLAN --------------------------------------------------------------------------------------------- - Insert on gpexp_hash (cost=0.00..30.00 rows=500 width=8) - -> Redistribute Motion 1:2 (slice1; segments: 1) (cost=0.00..30.00 rows=1000 width=8) + QUERY PLAN +----------------------------------------------------------------------------------------- + Insert on gpexp_hash (cost=0.00..1.25 rows=0 width=0) + -> Redistribute Motion 1:2 (slice1; segments: 1) (cost=0.00..1.25 rows=25 width=8) Hash Key: i.i - -> Function Scan on generate_series i (cost=0.00..10.00 rows=500 width=8) + -> Function Scan on generate_series i (cost=0.00..0.50 rows=50 width=8) Optimizer: Postgres query optimizer (5 rows) @@ -11693,14 +11703,14 @@ select b, count(*) from gpexp_hash group by b order by b; explain update gpexp_rand set b=(select b from gpexp_hash where gpexp_rand.a = gpexp_hash.a); QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Update on gpexp_rand (cost=0.00..216.00 rows=25 width=18) - -> Seq Scan on gpexp_rand (cost=0.00..215.00 rows=25 width=18) + Update on gpexp_rand (cost=0.00..70.00 rows=0 width=0) + -> Seq Scan on gpexp_rand (cost=0.00..70.00 rows=25 width=46) SubPlan 1 - -> Result (cost=0.00..4.25 rows=50 width=4) + -> Result (cost=0.00..2.75 rows=50 width=4) Filter: (gpexp_rand.a = gpexp_hash.a) - -> Materialize (cost=0.00..3.75 rows=50 width=8) - -> Broadcast Motion 2:2 (slice1; segments: 2) (cost=0.00..3.50 rows=25 width=8) - -> Seq Scan on gpexp_hash (cost=0.00..2.50 rows=25 width=8) + -> Materialize (cost=0.00..2.25 rows=50 width=8) + -> Broadcast Motion 2:2 (slice1; segments: 2) (cost=0.00..2.00 rows=50 width=8) + -> Seq Scan on gpexp_hash (cost=0.00..1.25 rows=25 width=8) Optimizer: Postgres query optimizer (9 rows) @@ -11726,7 +11736,7 @@ delete from gpexp_repl where b >= 20; explain insert into gpexp_repl values (20, 20); QUERY PLAN -------------------------------------------------------- - Insert on gpexp_repl (cost=0.00..0.01 rows=1 width=8) + Insert on gpexp_repl (cost=0.00..0.01 rows=0 width=0) -> Result (cost=0.00..0.01 rows=1 width=8) Optimizer: Postgres query optimizer (3 rows) @@ -11792,21 +11802,21 @@ analyze part2_1_prt_2; -- the plan should contain a 2 stage limit. If we incorrectly estimate that the -- relation is empty, we would end up choosing a single stage limit. 
explain select * from part1, part2 where part1.b = part2.b limit 5; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ - Limit (cost=0.00..862.14 rows=5 width=16) + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.00..862.13 rows=5 width=16) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.13 rows=5 width=16) -> Limit (cost=0.00..862.13 rows=2 width=16) - -> Hash Join (cost=0.00..862.13 rows=334 width=16) + -> Hash Join (cost=0.00..862.13 rows=333 width=16) Hash Cond: (part1.b = part2.b) - -> Dynamic Seq Scan on part1 (cost=0.00..431.01 rows=334 width=8) + -> Dynamic Seq Scan on part1 (cost=0.00..431.01 rows=333 width=8) Number of partitions to scan: 4 (out of 4) - -> Hash (cost=431.02..431.02 rows=100 width=8) - -> Partition Selector (selector id: $0) (cost=0.00..431.02 rows=100 width=8) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.02 rows=100 width=8) - -> Dynamic Seq Scan on part2 (cost=0.00..431.00 rows=34 width=8) + -> Hash (cost=431.02..431.02 rows=97 width=8) + -> Partition Selector (selector id: $0) (cost=0.00..431.02 rows=97 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.02 rows=97 width=8) + -> Dynamic Seq Scan on part2 (cost=0.00..431.00 rows=33 width=8) Number of partitions to scan: 4 (out of 4) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (13 rows) -- test opfamily handling in ORCA @@ -11858,14 +11868,14 @@ ANALYZE btab_old_hash; EXPLAIN SELECT a, b FROM atab_old_hash INNER JOIN btab_old_hash ON a |=| b; QUERY PLAN --------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..1324032.87 rows=5 width=8) - -> Nested Loop (cost=0.00..1324032.87 rows=2 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.86 rows=5 width=8) + -> Nested Loop (cost=0.00..1324032.86 rows=2 width=8) Join Filter: (atab_old_hash.a |=| btab_old_hash.b) -> Seq Scan on btab_old_hash (cost=0.00..431.00 rows=2 width=4) -> Materialize (cost=0.00..431.00 rows=3 width=4) - -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..431.00 rows=3 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=3 width=4) -> Seq Scan on atab_old_hash (cost=0.00..431.00 rows=1 width=4) - Optimizer: Pivotal Optimizer (GPORCA) version 3.93.0 + Optimizer: GPORCA (8 rows) SELECT a, b FROM atab_old_hash INNER JOIN btab_old_hash ON a |=| b; @@ -11951,20 +11961,37 @@ select disable_xform('CXformFullOuterJoin2HashJoin'); -- fallback reason: Invalid system target list found for AO table EXPLAIN SELECT a, b FROM atab_old_hash FULL JOIN btab_old_hash ON a |=| b; - QUERY PLAN ----------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=1.06..2.31 rows=6 width=8) - -> Hash Full Join (cost=1.06..2.23 rows=2 width=8) - Hash Cond: (atab_old_hash.a |=| btab_old_hash.b) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..1.03 rows=1 width=4) - Hash Key: atab_old_hash.a - -> Seq Scan on atab_old_hash (cost=0.00..1.01 rows=1 width=4) - -> Hash (cost=1.04..1.04 rows=1 width=4) - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..1.04 rows=1 width=4) - Hash Key: btab_old_hash.b - -> Seq Scan on btab_old_hash (cost=0.00..1.01 rows=1 width=4) - Optimizer: 
Postgres query optimizer -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..2586.00 rows=10 width=8) + -> Result (cost=0.00..2586.00 rows=4 width=8) + -> Sequence (cost=0.00..2586.00 rows=4 width=8) + -> Shared Scan (share slice:id 1:2) (cost=0.00..431.00 rows=2 width=1) + -> Seq Scan on btab_old_hash (cost=0.00..431.00 rows=2 width=22) + -> Sequence (cost=0.00..2155.00 rows=4 width=8) + -> Shared Scan (share slice:id 1:3) (cost=0.00..431.00 rows=1 width=1) + -> Seq Scan on atab_old_hash (cost=0.00..431.00 rows=1 width=22) + -> Append (cost=0.00..1724.00 rows=4 width=8) + -> Hash Left Join (cost=0.00..862.00 rows=3 width=8) + Hash Cond: (share2_ref2.b |=| share3_ref2.a) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=2 width=4) + Hash Key: share2_ref2.b + -> Shared Scan (share slice:id 2:2) (cost=0.00..431.00 rows=2 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) + Hash Key: share3_ref2.a + -> Shared Scan (share slice:id 3:3) (cost=0.00..431.00 rows=1 width=4) + -> Hash Anti Join (cost=0.00..862.00 rows=1 width=4) + Hash Cond: (share3_ref3.a |=| share2_ref3.b) + -> Redistribute Motion 3:3 (slice4; segments: 3) (cost=0.00..431.00 rows=1 width=4) + Hash Key: share3_ref3.a + -> Shared Scan (share slice:id 4:3) (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=2 width=4) + -> Redistribute Motion 3:3 (slice5; segments: 3) (cost=0.00..431.00 rows=2 width=4) + Hash Key: share2_ref3.b + -> Shared Scan (share slice:id 5:2) (cost=0.00..431.00 rows=2 width=4) + Optimizer: GPORCA +(28 rows) SELECT a, b FROM atab_old_hash FULL JOIN btab_old_hash ON a |=| b; a | b @@ -12020,9 +12047,9 @@ select disable_xform('CXformImplementInnerJoin'); EXPLAIN SELECT 1 FROM foo1 left join foo2 on foo1.a = foo2.a AND foo2.c = 3 AND foo2.b IN (SELECT b FROM foo3); QUERY PLAN --------------------------------------------------------------------------------------------------------------------- - Result (cost=0.00..1721310.84 rows=2 width=4) - -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1721310.84 rows=2 width=1) - -> Nested Loop Left Join (cost=0.00..1721310.84 rows=1 width=1) + Result (cost=0.00..1721310.83 rows=2 width=4) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1721310.83 rows=2 width=1) + -> Nested Loop Left Join (cost=0.00..1721310.83 rows=1 width=1) Join Filter: (foo1.a = foo2.a) -> Seq Scan on foo1 (cost=0.00..431.00 rows=1 width=4) -> Materialize (cost=0.00..818.97 rows=1 width=4) @@ -12035,7 +12062,7 @@ EXPLAIN SELECT 1 FROM foo1 left join foo2 on foo1.a = foo2.a AND foo2.c = 3 AND -> Hash (cost=431.00..431.00 rows=2 width=4) -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=2 width=4) -> Seq Scan on foo3 (cost=0.00..431.00 rows=1 width=4) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (16 rows) SELECT 1 FROM foo1 left join foo2 on foo1.a = foo2.a AND foo2.c = 3 AND foo2.b IN (SELECT b FROM foo3); @@ -12079,8 +12106,8 @@ SELECT DISTINCT L1.c, L1.lid FROM t55 L1 CROSS JOIN META WHERE L1.lid = int4in(textout(meta.load_id)); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. 
- QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------- Result (cost=0.00..431.09 rows=1 width=8) Output: c, lid -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.07 rows=1 width=8) @@ -12094,15 +12121,15 @@ NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entr -> Hash Join (cost=0.00..431.07 rows=1 width=8) Output: c, lid Hash Cond: (l1.lid = int4in(textout(('99'::text)))) - -> Seq Scan on orca.t55 l1 (cost=0.00..431.01 rows=334 width=8) + -> Seq Scan on orca.t55 l1 (cost=0.00..431.00 rows=334 width=8) Output: c, lid -> Hash (cost=0.00..0.00 rows=1 width=8) Output: ('2020-01-01'::text), ('99'::text) -> Result (cost=0.00..0.00 rows=1 width=1) Output: '2020-01-01'::text, '99'::text - Optimizer: Pivotal Optimizer (GPORCA) - Settings: optimizer_enable_coordinator_only_queries = 'on', optimizer_enable_master_only_queries = 'on', optimizer_join_order = 'query', optimizer_segments = '3' -(25 rows) + Settings: enable_incremental_sort = 'on', optimizer = 'on', optimizer_enable_dynamicbitmapscan = 'on', optimizer_join_order = 'query' + Optimizer: GPORCA +(21 rows) CREATE TABLE TP AS WITH META AS (SELECT '2020-01-01' AS VALID_DT, '99' AS LOAD_ID) @@ -12132,8 +12159,8 @@ explain select * from lossycastrangepart where b::int = 10; Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=16) -> Dynamic Seq Scan on lossycastrangepart (cost=0.00..431.00 rows=1 width=16) Number of partitions to scan: 4 (out of 4) - Filter: (int4(b) = 10) - Optimizer: Pivotal Optimizer (GPORCA) + Filter: ((b)::integer = 10) + Optimizer: GPORCA (5 rows) select * from lossycastrangepart where b::int = 10; @@ -12149,8 +12176,8 @@ explain select * from lossycastrangepart where b::int = 11; Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=16) -> Dynamic Seq Scan on lossycastrangepart (cost=0.00..431.00 rows=1 width=16) Number of partitions to scan: 4 (out of 4) - Filter: (int4(b) = 11) - Optimizer: Pivotal Optimizer (GPORCA) + Filter: ((b)::integer = 11) + Optimizer: GPORCA (5 rows) select * from lossycastrangepart where b::int = 11; @@ -12166,8 +12193,8 @@ explain select * from lossycastrangepart where b::int < 10; Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=16) -> Dynamic Seq Scan on lossycastrangepart (cost=0.00..431.00 rows=1 width=16) Number of partitions to scan: 4 (out of 4) - Filter: (int4(b) < 10) - Optimizer: Pivotal Optimizer (GPORCA) + Filter: ((b)::integer < 10) + Optimizer: GPORCA (5 rows) select * from lossycastrangepart where b::int < 10; @@ -12337,12 +12364,12 @@ where out.b in (select coalesce(tcorr2.a, 99) from tcorr1 left outer join tcorr2 on tcorr1.a=tcorr2.a+out.a); QUERY PLAN -------------------------------------------------------------------------------------------------- - Result (cost=0.00..1356692031.36 rows=1 width=8) + Result (cost=0.00..1356692012.89 rows=1 width=8) Filter: (SubPlan 1) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr1 "out" (cost=0.00..431.00 rows=1 width=8) SubPlan 1 - -> Nested Loop Left Join (cost=0.00..1324032.56 rows=2 width=4) + -> Nested Loop Left Join (cost=0.00..1324032.54 rows=2 width=4) Join Filter: (tcorr1.a = (tcorr2.a + "out".a)) -> Materialize 
(cost=0.00..431.00 rows=1 width=4) -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) @@ -12371,7 +12398,7 @@ where out.b in (select max(tcorr2.b + out.b - 1) where tcorr2.a=out.a); QUERY PLAN -------------------------------------------------------------------------------------------------------- - Result (cost=0.00..1324032.63 rows=1 width=8) + Result (cost=0.00..1324032.61 rows=1 width=8) Filter: (SubPlan 1) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr1 "out" (cost=0.00..431.00 rows=1 width=8) @@ -12406,12 +12433,12 @@ where out.b in (select coalesce(tcorr2_d.c, 99) group by a) tcorr2_d on tcorr1.a=tcorr2_d.a); QUERY PLAN -------------------------------------------------------------------------------------------------------------------- - Result (cost=0.00..1356692228.44 rows=1 width=8) + Result (cost=0.00..1356692209.98 rows=1 width=8) Filter: (SubPlan 1) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr1 "out" (cost=0.00..431.00 rows=1 width=8) SubPlan 1 - -> Nested Loop Left Join (cost=0.00..1324032.75 rows=3 width=8) + -> Nested Loop Left Join (cost=0.00..1324032.74 rows=3 width=8) Join Filter: (tcorr1.a = tcorr2.a) -> Materialize (cost=0.00..431.00 rows=1 width=4) -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) @@ -12425,7 +12452,7 @@ where out.b in (select coalesce(tcorr2_d.c, 99) -> Materialize (cost=0.00..431.00 rows=1 width=8) -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr2 (cost=0.00..431.00 rows=1 width=8) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (20 rows) -- expect 1 row @@ -12459,12 +12486,12 @@ where out.b in (select coalesce(tcorr2.a, 99) from tcorr1 left outer join tcorr2 on tcorr1.a=tcorr2.a+out.a); QUERY PLAN -------------------------------------------------------------------------------------------------- - Result (cost=0.00..1356692031.36 rows=1 width=8) + Result (cost=0.00..1356692012.89 rows=1 width=8) Filter: (SubPlan 1) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr1 "out" (cost=0.00..431.00 rows=1 width=8) SubPlan 1 - -> Nested Loop Left Join (cost=0.00..1324032.56 rows=2 width=4) + -> Nested Loop Left Join (cost=0.00..1324032.54 rows=2 width=4) Join Filter: (tcorr1.a = (tcorr2.a + "out".a)) -> Materialize (cost=0.00..431.00 rows=1 width=4) -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) @@ -12472,7 +12499,7 @@ where out.b in (select coalesce(tcorr2.a, 99) -> Materialize (cost=0.00..431.00 rows=1 width=4) -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) -> Seq Scan on tcorr2 (cost=0.00..431.00 rows=1 width=4) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (14 rows) -- expect 1 row @@ -12493,7 +12520,7 @@ where out.b in (select max(tcorr2.b + out.b - 1) where tcorr2.a=out.a); QUERY PLAN -------------------------------------------------------------------------------------------------------- - Result (cost=0.00..1324032.63 rows=1 width=8) + Result (cost=0.00..1324032.61 rows=1 width=8) Filter: (SubPlan 1) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr1 "out" (cost=0.00..431.00 rows=1 width=8) @@ -12528,12 +12555,12 @@ where out.b in (select coalesce(tcorr2_d.c, 99) group by a) tcorr2_d on tcorr1.a=tcorr2_d.a); QUERY PLAN 
-------------------------------------------------------------------------------------------------------------------- - Result (cost=0.00..1356692228.44 rows=1 width=8) + Result (cost=0.00..1356692209.98 rows=1 width=8) Filter: (SubPlan 1) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr1 "out" (cost=0.00..431.00 rows=1 width=8) SubPlan 1 - -> Nested Loop Left Join (cost=0.00..1324032.75 rows=3 width=8) + -> Nested Loop Left Join (cost=0.00..1324032.74 rows=3 width=8) Join Filter: (tcorr1.a = tcorr2.a) -> Materialize (cost=0.00..431.00 rows=1 width=4) -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) @@ -12547,7 +12574,7 @@ where out.b in (select coalesce(tcorr2_d.c, 99) -> Materialize (cost=0.00..431.00 rows=1 width=8) -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=8) -> Seq Scan on tcorr2 (cost=0.00..431.00 rows=1 width=8) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (20 rows) -- expect 1 row @@ -13739,8 +13766,8 @@ set enable_hashjoin=off; explain select r.a, r.b, r.c, l.c from left_outer_index_nl_foo r left outer join left_outer_index_nl_bar l on r.b=l.b; QUERY PLAN --------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.80 rows=7 width=16) - -> Nested Loop Left Join (cost=0.00..1324033.80 rows=3 width=16) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.78 rows=7 width=16) + -> Nested Loop Left Join (cost=0.00..1324033.78 rows=3 width=16) Join Filter: (r.b = l.b) -> Seq Scan on left_outer_index_nl_foo r (cost=0.00..431.00 rows=2 width=12) -> Materialize (cost=0.00..431.00 rows=5 width=8) @@ -13773,8 +13800,8 @@ analyze left_outer_index_nl_bar_hash; explain select r.a, r.b, r.c, l.c from left_outer_index_nl_foo_hash r left outer join left_outer_index_nl_bar l on r.b=l.b; QUERY PLAN --------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.77 rows=7 width=14) - -> Nested Loop Left Join (cost=0.00..1324033.77 rows=3 width=14) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.75 rows=7 width=14) + -> Nested Loop Left Join (cost=0.00..1324033.75 rows=3 width=14) Join Filter: (r.b = l.b) -> Seq Scan on left_outer_index_nl_foo_hash r (cost=0.00..431.00 rows=2 width=10) -> Materialize (cost=0.00..431.00 rows=5 width=8) @@ -13794,10 +13821,10 @@ select r.a, r.b, r.c, l.c from left_outer_index_nl_foo_hash r left outer join le --- verify that a motion is introduced such that joins on each segment are internal to that segment (distributed by join key) explain select r.a, r.b, r.c, l.c from left_outer_index_nl_foo_hash r left outer join left_outer_index_nl_bar_hash l on r.b=l.b; - QUERY PLAN ------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.54 rows=7 width=12) - -> Nested Loop Left Join (cost=0.00..1324033.54 rows=3 width=12) + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.52 rows=7 width=12) + -> Nested Loop Left Join (cost=0.00..1324033.52 rows=3 width=12) Join Filter: (r.b = l.b) -> Seq Scan on left_outer_index_nl_foo_hash r (cost=0.00..431.00 rows=2 width=10) -> Materialize (cost=0.00..431.00 rows=5 
width=6) @@ -13826,8 +13853,8 @@ analyze left_outer_index_nl_bar_repl; explain select r.a, r.b, r.c, l.c from left_outer_index_nl_foo_repl r left outer join left_outer_index_nl_bar_repl l on r.b=l.b; QUERY PLAN ---------------------------------------------------------------------------------------------- - Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..1324035.09 rows=7 width=16) - -> Nested Loop Left Join (cost=0.00..1324035.09 rows=21 width=16) + Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..1324035.01 rows=7 width=16) + -> Nested Loop Left Join (cost=0.00..1324035.01 rows=21 width=16) Join Filter: (r.b = l.b) -> Seq Scan on left_outer_index_nl_foo_repl r (cost=0.00..431.00 rows=12 width=12) -> Seq Scan on left_outer_index_nl_bar_repl l (cost=0.00..431.00 rows=15 width=8) @@ -13845,10 +13872,10 @@ select r.a, r.b, r.c, l.c from left_outer_index_nl_foo_repl r left outer join le --- outer side replicated, inner side hashed can have interesting cases (gather + join on one segment of inner side and redistribute + join + gather are both valid) explain select r.a, r.b, r.c, l.c from left_outer_index_nl_foo_repl r left outer join left_outer_index_nl_bar_hash l on r.b=l.b; - QUERY PLAN ------------------------------------------------------------------------------------------------------- - Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..1324035.41 rows=7 width=14) - -> Nested Loop Left Join (cost=0.00..1324035.41 rows=21 width=14) + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=0.00..1324035.37 rows=7 width=14) + -> Nested Loop Left Join (cost=0.00..1324035.37 rows=21 width=14) Join Filter: (r.b = l.b) -> Seq Scan on left_outer_index_nl_foo_repl r (cost=0.00..431.00 rows=12 width=12) -> Materialize (cost=0.00..431.00 rows=15 width=6) @@ -13884,8 +13911,8 @@ ANALYZE tt2; EXPLAIN SELECT b FROM tt1 WHERE NOT EXISTS (SELECT * FROM tt2 WHERE (tt2.d = tt1.b) IS DISTINCT FROM false); QUERY PLAN --------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.03 rows=4 width=4) - -> Nested Loop Anti Join (cost=0.00..1324033.03 rows=2 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.01 rows=4 width=4) + -> Nested Loop Anti Join (cost=0.00..1324033.01 rows=2 width=4) Join Filter: ((tt2.d = tt1.b) IS DISTINCT FROM false) -> Seq Scan on tt1 (cost=0.00..431.00 rows=2 width=4) -> Materialize (cost=0.00..431.00 rows=4 width=4) @@ -13902,8 +13929,8 @@ SELECT b FROM tt1 WHERE NOT EXISTS (SELECT * FROM tt2 WHERE (tt2.d = tt1.b) IS D EXPLAIN SELECT b FROM tt1 WHERE NOT EXISTS (SELECT * FROM tt2 WHERE (tt2.d = tt1.b) IS DISTINCT FROM true); QUERY PLAN --------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.03 rows=4 width=4) - -> Nested Loop Anti Join (cost=0.00..1324033.03 rows=2 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.01 rows=4 width=4) + -> Nested Loop Anti Join (cost=0.00..1324033.01 rows=2 width=4) Join Filter: ((tt2.d = tt1.b) IS DISTINCT FROM true) -> Seq Scan on tt1 (cost=0.00..431.00 rows=2 width=4) -> Materialize (cost=0.00..431.00 rows=4 width=4) @@ -13920,8 +13947,8 @@ SELECT b FROM tt1 WHERE NOT EXISTS (SELECT * FROM tt2 WHERE (tt2.d = tt1.b) IS D EXPLAIN SELECT b FROM tt1 WHERE NOT EXISTS (SELECT * FROM tt2 WHERE 
(tt1.b = tt2.d) IS DISTINCT FROM NULL); QUERY PLAN --------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.03 rows=4 width=4) - -> Nested Loop Anti Join (cost=0.00..1324033.03 rows=2 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324033.01 rows=4 width=4) + -> Nested Loop Anti Join (cost=0.00..1324033.01 rows=2 width=4) Join Filter: (NOT ((tt1.b = tt2.d) IS NULL)) -> Seq Scan on tt1 (cost=0.00..431.00 rows=2 width=4) -> Materialize (cost=0.00..431.00 rows=4 width=4) @@ -14754,8 +14781,8 @@ analyze ts_tbl; explain select * from ts_tbl where ts = to_timestamp('99991231'::text, 'YYYYMMDD'::text); QUERY PLAN ------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..6.01 rows=40 width=8) - -> Index Only Scan using ts_tbl_idx on ts_tbl (cost=0.00..6.00 rows=14 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..6.01 rows=1 width=8) + -> Index Only Scan using ts_tbl_idx on ts_tbl (cost=0.00..6.01 rows=1 width=8) Index Cond: (ts = 'Fri Dec 31 00:00:00 9999 PST'::timestamp with time zone) Optimizer: Pivotal Optimizer (GPORCA) (4 rows) diff --git a/contrib/pax_storage/src/test/regress/expected/groupingsets.out b/contrib/pax_storage/src/test/regress/expected/groupingsets.out index 5dd8bb8b53e..25a26d506f8 100644 --- a/contrib/pax_storage/src/test/regress/expected/groupingsets.out +++ b/contrib/pax_storage/src/test/regress/expected/groupingsets.out @@ -1784,6 +1784,7 @@ select array(select row(v.a,s1.*) from (select two,four, count(*) from onek grou (2 rows) -- test the knapsack +set hash_mem_multiplier = 1; set enable_indexscan = false; set work_mem = '64kB'; explain (costs off) @@ -1863,6 +1864,7 @@ explain (costs off) Optimizer: Postgres query optimizer (15 rows) +reset hash_mem_multiplier; -- check collation-sensitive matching between grouping expressions -- (similar to a check for aggregates, but there are additional code -- paths for GROUPING, so check again here) diff --git a/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out b/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out index 9923ea610d2..b3da68b1f9d 100644 --- a/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/groupingsets_optimizer.out @@ -1847,6 +1847,7 @@ select array(select row(v.a,s1.*) from (select two,four, count(*) from onek grou (2 rows) -- test the knapsack +set hash_mem_multiplier = 1; set enable_indexscan = false; set work_mem = '64kB'; explain (costs off) @@ -1890,12 +1891,11 @@ explain (costs off) -> Streaming Partial HashAggregate Group Key: share0_ref5.hundred -> Shared Scan (share slice:id 5:0) - -> Finalize HashAggregate + -> HashAggregate Group Key: share0_ref6.thousand -> Redistribute Motion 3:3 (slice6; segments: 3) Hash Key: share0_ref6.thousand - -> Streaming Partial HashAggregate - Group Key: share0_ref6.thousand + -> Result -> Shared Scan (share slice:id 6:0) -> HashAggregate Group Key: share0_ref7.twothousand @@ -1907,7 +1907,7 @@ explain (costs off) Group Key: share0_ref8.unique1 -> Shared Scan (share slice:id 1:0) Optimizer: Pivotal Optimizer (GPORCA) -(50 rows) +(49 rows) explain (costs off) select unique1, @@ -1998,12 +1998,11 @@ explain (costs off) -> Streaming Partial HashAggregate Group Key: share0_ref5.hundred -> Shared Scan (share slice:id 5:0) - -> Finalize HashAggregate 
+ -> HashAggregate Group Key: share0_ref6.thousand -> Redistribute Motion 3:3 (slice6; segments: 3) Hash Key: share0_ref6.thousand - -> Streaming Partial HashAggregate - Group Key: share0_ref6.thousand + -> Result -> Shared Scan (share slice:id 6:0) -> HashAggregate Group Key: share0_ref7.twothousand @@ -2015,8 +2014,9 @@ explain (costs off) Group Key: share0_ref8.unique1 -> Shared Scan (share slice:id 1:0) Optimizer: Pivotal Optimizer (GPORCA) -(50 rows) +(49 rows) +reset hash_mem_multiplier; -- check collation-sensitive matching between grouping expressions -- (similar to a check for aggregates, but there are additional code -- paths for GROUPING, so check again here) diff --git a/contrib/pax_storage/src/test/regress/expected/incremental_analyze.out b/contrib/pax_storage/src/test/regress/expected/incremental_analyze.out index f03df4df785..1808ca8082e 100644 --- a/contrib/pax_storage/src/test/regress/expected/incremental_analyze.out +++ b/contrib/pax_storage/src/test/regress/expected/incremental_analyze.out @@ -1949,7 +1949,7 @@ SELECT staattnum, stakind1, stakind2, stakind3, stakind4, stakind5, FROM pg_statistic WHERE starelid = 'simple_table_no_hll'::regclass; staattnum | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | stavalues1 | stavalues2 | stavalues3 | stavalues4 | stavalues5 -----------+----------+----------+----------+----------+----------+--------------+------------+------------+------------+------------ - 1 | 2 | 3 | 0 | 0 | 0 | {1,3,5,7,10} | | | | + 1 | 2 | 3 | 8 | 0 | 0 | {1,3,5,7,10} | | {10} | | (1 row) -- Make sure analyze rootpartition option works in an option list diff --git a/contrib/pax_storage/src/test/regress/expected/limit.out b/contrib/pax_storage/src/test/regress/expected/limit.out index 4b6e5e1f0a6..b03d00ce77e 100644 --- a/contrib/pax_storage/src/test/regress/expected/limit.out +++ b/contrib/pax_storage/src/test/regress/expected/limit.out @@ -131,7 +131,7 @@ select * from int8_tbl offset (case when random() < 0.5 then null::bigint end); (5 rows) -- Test assorted cases involving backwards fetch from a LIMIT plan node --- Disable backward scan test which is not supported in this version of Cloudberry Database +-- Disable backward scan test which is not supported in this version of Apache Cloudberry --start_ignore /* * begin; diff --git a/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out b/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out index b8562703a6c..4892a3bb414 100644 --- a/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/limit_optimizer.out @@ -131,7 +131,7 @@ select * from int8_tbl offset (case when random() < 0.5 then null::bigint end); (5 rows) -- Test assorted cases involving backwards fetch from a LIMIT plan node --- Disable backward scan test which is not supported in this version of Cloudberry Database +-- Disable backward scan test which is not supported in this version of Apache Cloudberry --start_ignore /* * begin; @@ -358,29 +358,28 @@ order by s2 desc; explain (verbose, costs off) select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 from tenk1 group by thousand order by thousand limit 3; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------- Result - Output: (sum(tenthous)), 
(((sum(tenthous))::double precision + (random() * '0'::double precision))) + Output: (sum(tenthous)), ((((sum(tenthous)))::double precision + (random() * '0'::double precision))) -> Limit - Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() * '0'::double precision))), thousand + Output: (sum(tenthous)), ((((sum(tenthous)))::double precision + (random() * '0'::double precision))), thousand -> Gather Motion 3:1 (slice1; segments: 3) - Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() * '0'::double precision))), thousand + Output: (sum(tenthous)), ((((sum(tenthous)))::double precision + (random() * '0'::double precision))), thousand Merge Key: thousand -> Limit - Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() * '0'::double precision))), thousand - -> Finalize GroupAggregate - Output: sum(tenthous), ((sum(tenthous))::double precision + (random() * '0'::double precision)), thousand - Group Key: tenk1.thousand + Output: (sum(tenthous)), ((((sum(tenthous)))::double precision + (random() * '0'::double precision))), thousand + -> Result + Output: (sum(tenthous)), (((sum(tenthous)))::double precision + (random() * '0'::double precision)), thousand -> Sort - Output: thousand, (PARTIAL sum(tenthous)), (PARTIAL sum(tenthous)) + Output: (sum(tenthous)), (sum(tenthous)), thousand Sort Key: tenk1.thousand - -> Redistribute Motion 3:3 (slice2; segments: 3) - Output: thousand, (PARTIAL sum(tenthous)), (PARTIAL sum(tenthous)) - Hash Key: thousand - -> Streaming Partial HashAggregate - Output: thousand, PARTIAL sum(tenthous), PARTIAL sum(tenthous) - Group Key: tenk1.thousand + -> HashAggregate + Output: sum(tenthous), sum(tenthous), thousand + Group Key: tenk1.thousand + -> Redistribute Motion 3:3 (slice2; segments: 3) + Output: thousand, tenthous + Hash Key: thousand -> Seq Scan on public.tenk1 Output: thousand, tenthous Optimizer: Pivotal Optimizer (GPORCA) diff --git a/contrib/pax_storage/src/test/regress/expected/misc_functions_optimizer.out b/contrib/pax_storage/src/test/regress/expected/misc_functions_optimizer.out index 8cdcba95fb1..56138548e88 100644 --- a/contrib/pax_storage/src/test/regress/expected/misc_functions_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/misc_functions_optimizer.out @@ -235,7 +235,7 @@ WHERE my_int_eq(a.unique2, 42); -> Hash -> Seq Scan on tenk1 a Filter: my_int_eq(unique2, 42) - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (8 rows) -- With support function that knows it's int4eq, we get a different plan @@ -243,17 +243,16 @@ ALTER FUNCTION my_int_eq(int, int) SUPPORT test_support_func; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 a JOIN tenk1 b ON a.unique1 = b.unique1 WHERE my_int_eq(a.unique2, 42); - QUERY PLAN ------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) - -> Hash Join - Hash Cond: (b.unique1 = a.unique1) - -> Seq Scan on tenk1 b - -> Hash - -> Seq Scan on tenk1 a - Filter: my_int_eq(unique2, 42) - Optimizer: Pivotal Optimizer (GPORCA) -(8 rows) + -> Nested Loop + -> Seq Scan on tenk1 a + Filter: my_int_eq(unique2, 42) + -> Index Scan using tenk1_unique1 on tenk1 b + Index Cond: (unique1 = a.unique1) + Optimizer: Postgres query optimizer +(7 rows) -- Also test non-default rowcount estimate CREATE FUNCTION my_gen_series(int, int) RETURNS SETOF integer @@ -262,27 +261,26 @@ CREATE FUNCTION my_gen_series(int, int) RETURNS SETOF integer SUPPORT 
test_support_func; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 a JOIN my_gen_series(1,1000) g ON a.unique1 = g; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) -> Hash Join - Hash Cond: (a.unique1 = my_gen_series.my_gen_series) - -> Seq Scan on tenk1 a + Hash Cond: (g.g = a.unique1) + -> Function Scan on my_gen_series g -> Hash - -> Function Scan on my_gen_series - Optimizer: Pivotal Optimizer (GPORCA) + -> Seq Scan on tenk1 a + Optimizer: Postgres query optimizer (7 rows) EXPLAIN (COSTS OFF) SELECT * FROM tenk1 a JOIN my_gen_series(1,5) g ON a.unique1 = g; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) - -> Hash Join - Hash Cond: (a.unique1 = my_gen_series.my_gen_series) - -> Seq Scan on tenk1 a - -> Hash - -> Function Scan on my_gen_series - Optimizer: Pivotal Optimizer (GPORCA) -(7 rows) + -> Nested Loop + -> Function Scan on my_gen_series g + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = g.g) + Optimizer: Postgres query optimizer +(6 rows) diff --git a/contrib/pax_storage/src/test/regress/expected/olap_plans_optimizer.out b/contrib/pax_storage/src/test/regress/expected/olap_plans_optimizer.out index fda6a3821ca..4f4915b3dd8 100644 --- a/contrib/pax_storage/src/test/regress/expected/olap_plans_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/olap_plans_optimizer.out @@ -79,18 +79,20 @@ select a, b, c, sum(d) from olap_test group by a, b, c; -- If it's not a superset, redistribution is needed. explain select a, sum(d) from olap_test group by a; - QUERY PLAN -------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.55 rows=3 width=12) - -> Finalize HashAggregate (cost=0.00..431.55 rows=1 width=12) + -> Finalize GroupAggregate (cost=0.00..431.55 rows=1 width=12) Group Key: a - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.55 rows=1 width=12) - Hash Key: a - -> Streaming Partial HashAggregate (cost=0.00..431.55 rows=1 width=12) - Group Key: a - -> Seq Scan on olap_test (cost=0.00..431.09 rows=3334 width=8) + -> Sort (cost=0.00..431.55 rows=2 width=12) + Sort Key: a + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.55 rows=2 width=12) + Hash Key: a + -> Streaming Partial HashAggregate (cost=0.00..431.55 rows=2 width=12) + Group Key: a + -> Seq Scan on olap_test (cost=0.00..431.09 rows=3334 width=8) Optimizer: GPORCA -(9 rows) +(11 rows) select a, sum(d) from olap_test group by a; a | sum @@ -185,8 +187,8 @@ set gp_motion_cost_per_row=1.0; -- If the query produces a relatively small number of groups in comparison to -- the number of input rows, two-stage aggregation will be picked. 
explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c)); - QUERY PLAN --------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1726.51 rows=152 width=20) -> Sequence (cost=0.00..1726.50 rows=51 width=20) -> Shared Scan (share slice:id 1:0) (cost=0.00..431.19 rows=3334 width=1) @@ -199,18 +201,20 @@ explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a) -> Streaming Partial HashAggregate (cost=0.00..431.91 rows=44 width=16) Group Key: share0_ref2.b, share0_ref2.c -> Shared Scan (share slice:id 2:0) (cost=0.00..431.10 rows=3334 width=12) - -> Finalize HashAggregate (cost=0.00..431.47 rows=1 width=12) + -> Finalize GroupAggregate (cost=0.00..431.47 rows=1 width=12) Group Key: share0_ref3.a - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=1 width=12) - Hash Key: share0_ref3.a - -> Streaming Partial HashAggregate (cost=0.00..431.47 rows=1 width=12) - Group Key: share0_ref3.a - -> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8) + -> Sort (cost=0.00..431.47 rows=2 width=12) + Sort Key: share0_ref3.a + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=2 width=12) + Hash Key: share0_ref3.a + -> Streaming Partial HashAggregate (cost=0.00..431.47 rows=2 width=12) + Group Key: share0_ref3.a + -> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8) -> HashAggregate (cost=0.00..431.91 rows=7 width=16) Group Key: share0_ref4.a, share0_ref4.b -> Shared Scan (share slice:id 1:0) (cost=0.00..431.10 rows=3334 width=12) Optimizer: GPORCA -(23 rows) +(25 rows) select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c)); a | b | c | sum @@ -255,8 +259,8 @@ select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c) -- If the query produces a relatively large number of groups in comparison to -- the number of input rows, one-stage aggregation will be picked. 
explain select a, b, d, sum(d) from olap_test group by grouping sets((a, b), (a), (b, d)); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1727.48 rows=10022 width=20) -> Sequence (cost=0.00..1726.74 rows=3341 width=20) -> Shared Scan (share slice:id 1:0) (cost=0.00..431.17 rows=3334 width=1) @@ -268,18 +272,20 @@ explain select a, b, d, sum(d) from olap_test group by grouping sets((a, b), (a) Hash Key: share0_ref2.b, share0_ref2.d -> Result (cost=0.00..431.06 rows=3334 width=8) -> Shared Scan (share slice:id 2:0) (cost=0.00..431.06 rows=3334 width=8) - -> Finalize HashAggregate (cost=0.00..431.47 rows=1 width=12) + -> Finalize GroupAggregate (cost=0.00..431.47 rows=1 width=12) Group Key: share0_ref3.a - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=1 width=12) - Hash Key: share0_ref3.a - -> Streaming Partial HashAggregate (cost=0.00..431.47 rows=1 width=12) - Group Key: share0_ref3.a - -> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8) + -> Sort (cost=0.00..431.47 rows=2 width=12) + Sort Key: share0_ref3.a + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=2 width=12) + Hash Key: share0_ref3.a + -> Streaming Partial HashAggregate (cost=0.00..431.47 rows=2 width=12) + Group Key: share0_ref3.a + -> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8) -> HashAggregate (cost=0.00..431.91 rows=7 width=16) Group Key: share0_ref4.a, share0_ref4.b -> Shared Scan (share slice:id 1:0) (cost=0.00..431.10 rows=3334 width=12) Optimizer: GPORCA -(22 rows) +(24 rows) -- do not execute this query as it would produce too many tuples. -- Test that when the second-stage Agg doesn't try to preserve the @@ -292,8 +298,8 @@ explain select a, b, d, sum(d) from olap_test group by grouping sets((a, b), (a) -- from the Merge Key. 
set enable_hashagg=off; explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c)) limit 200; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- Limit (cost=0.00..1726.51 rows=152 width=20) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1726.51 rows=152 width=20) -> Sequence (cost=0.00..1726.50 rows=51 width=20) @@ -307,18 +313,20 @@ explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a) -> Streaming Partial HashAggregate (cost=0.00..431.91 rows=44 width=16) Group Key: share0_ref2.b, share0_ref2.c -> Shared Scan (share slice:id 2:0) (cost=0.00..431.10 rows=3334 width=12) - -> Finalize HashAggregate (cost=0.00..431.47 rows=1 width=12) + -> Finalize GroupAggregate (cost=0.00..431.47 rows=1 width=12) Group Key: share0_ref3.a - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=1 width=12) - Hash Key: share0_ref3.a - -> Streaming Partial HashAggregate (cost=0.00..431.47 rows=1 width=12) - Group Key: share0_ref3.a - -> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8) + -> Sort (cost=0.00..431.47 rows=2 width=12) + Sort Key: share0_ref3.a + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=2 width=12) + Hash Key: share0_ref3.a + -> Streaming Partial HashAggregate (cost=0.00..431.47 rows=2 width=12) + Group Key: share0_ref3.a + -> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8) -> HashAggregate (cost=0.00..431.91 rows=7 width=16) Group Key: share0_ref4.a, share0_ref4.b -> Shared Scan (share slice:id 1:0) (cost=0.00..431.10 rows=3334 width=12) Optimizer: GPORCA -(24 rows) +(26 rows) reset enable_hashagg; -- @@ -331,48 +339,40 @@ create table foo_ctas(a int, b int) distributed randomly; insert into foo_ctas select g%5, g%2 from generate_series(1, 100) g; analyze foo_ctas; explain create table bar_ctas as select * from foo_ctas group by a, b distributed by (b); - QUERY PLAN ------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------- Result (cost=0.00..431.10 rows=6 width=8) -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.01 rows=6 width=8) Hash Key: b -> GroupAggregate (cost=0.00..431.01 rows=2 width=8) Group Key: a, b - -> Sort (cost=0.00..431.01 rows=2 width=8) + -> Sort (cost=0.00..431.01 rows=34 width=8) Sort Key: a, b - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.01 rows=2 width=8) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=34 width=8) Hash Key: a, b - -> GroupAggregate (cost=0.00..431.01 rows=2 width=8) - Group Key: a, b - -> Sort (cost=0.00..431.01 rows=34 width=8) - Sort Key: a, b - -> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8) - Optimizer: Pivotal Optimizer (GPORCA) -(15 rows) + -> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8) + Optimizer: GPORCA +(11 rows) create table bar_ctas as select * from foo_ctas group by a, b distributed by (b); -- Currently, the planner misses this optimization with INSERT, so this -- needs an extra Redistribute Motion. 
explain insert into bar_ctas select * from foo_ctas group by a, b; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- Insert on bar_ctas (cost=0.00..431.10 rows=2 width=8) -> Result (cost=0.00..431.01 rows=6 width=12) -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.01 rows=6 width=8) Hash Key: foo_ctas.b -> GroupAggregate (cost=0.00..431.01 rows=2 width=8) Group Key: foo_ctas.a, foo_ctas.b - -> Sort (cost=0.00..431.01 rows=2 width=8) + -> Sort (cost=0.00..431.01 rows=34 width=8) Sort Key: foo_ctas.a, foo_ctas.b - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.01 rows=2 width=8) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=34 width=8) Hash Key: foo_ctas.a, foo_ctas.b - -> GroupAggregate (cost=0.00..431.01 rows=2 width=8) - Group Key: foo_ctas.a, foo_ctas.b - -> Sort (cost=0.00..431.01 rows=34 width=8) - Sort Key: foo_ctas.a, foo_ctas.b - -> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8) - Optimizer: Pivotal Optimizer (GPORCA) -(16 rows) + -> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8) + Optimizer: GPORCA +(12 rows) drop table foo_ctas; drop table bar_ctas; diff --git a/contrib/pax_storage/src/test/regress/expected/qp_misc.out b/contrib/pax_storage/src/test/regress/expected/qp_misc.out index 5dbf5419ad6..28adde90fe9 100644 --- a/contrib/pax_storage/src/test/regress/expected/qp_misc.out +++ b/contrib/pax_storage/src/test/regress/expected/qp_misc.out @@ -745,7 +745,7 @@ f1,f2,f3 -- SubqueryInCase_p1 -- test expected to fail until function supported in GPDB --- GPDB Limitation ERROR: Cloudberry Database does not yet support that query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support that query. DETAIL: The query contains a multi-row subquery. select 'SubqueryInCase_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -848,7 +848,7 @@ f1,f2,f3 -- SubqueryPredicateNotIn_p1 -- test expected to fail until function supported in GPDB --- GPDB Limitation ERROR: Cloudberry Database does not yet support that query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support that query. DETAIL: The query contains a multi-row subquery. select 'SubqueryPredicateNotIn_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -923,7 +923,7 @@ f1,f2,f3 -- SubqueryQuantifiedPredicateEmpty_p1 -- test expected to fail until GPDB support this function --- GPDB Limitation ERROR: Cloudberry Database does not yet support this query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support this query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateEmpty_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -943,7 +943,7 @@ f1,f2,f3 -- SubqueryQuantifiedPredicateLarge_p1 -- test expected to fail until GPDB supports this function --- GPDB Limitation ERROR: Cloudberry Database does not yet support that query. DETAIL: The query contains a multi-row subquery. 
+-- GPDB Limitation ERROR: Apache Cloudberry does not yet support that query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateLarge_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -2732,7 +2732,7 @@ f1 -- SubqueryQuantifiedPredicateNull_gp_p1 -- test expected to fail until GPDB support function --- GPDB Limitation ERROR: Cloudberry Database does not yet support this query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support this query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateNull_gp_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1, count(*) c from ( @@ -2749,7 +2749,7 @@ f1 -- SubqueryQuantifiedPredicateSmall_gp_p1 -- test expected to fail until GPDB supports function --- GPDB Limitation ERROR: Cloudberry Database does not yet support this query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support this query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateSmall_gp_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1, count(*) c from ( diff --git a/contrib/pax_storage/src/test/regress/expected/qp_misc_rio_join_small.out b/contrib/pax_storage/src/test/regress/expected/qp_misc_rio_join_small.out index 9c38612fbcd..334180e2645 100644 --- a/contrib/pax_storage/src/test/regress/expected/qp_misc_rio_join_small.out +++ b/contrib/pax_storage/src/test/regress/expected/qp_misc_rio_join_small.out @@ -31,10 +31,10 @@ CREATE TABLE my_tt_agg_small ( -- COPY my_tt_agg_small (symbol, event_ts, trade_price, trade_volume) FROM stdin; -- --- Cloudberry Database database dump complete +-- Apache Cloudberry database dump complete -- -- --- Cloudberry Database database dump +-- Apache Cloudberry database dump -- -- -- Name: my_tq_agg_small; Type: TABLE; Schema: public; Tablespace: @@ -55,7 +55,7 @@ COPY my_tq_agg_small (ets, sym, bid_price, ask_price, end_ts) FROM stdin; -- CREATE INDEX my_tq_agg_small_ets_end_ts_ix ON my_tq_agg_small USING btree (ets, end_ts); -- --- Cloudberry Database database dump complete +-- Apache Cloudberry database dump complete -- set enable_hashjoin=off; create index my_tt_agg_small_event_ts_ix on my_tt_agg_small(event_ts); diff --git a/contrib/pax_storage/src/test/regress/expected/qp_with_clause.out b/contrib/pax_storage/src/test/regress/expected/qp_with_clause.out index 2a496cc0f7a..f2b67df8d8f 100644 --- a/contrib/pax_storage/src/test/regress/expected/qp_with_clause.out +++ b/contrib/pax_storage/src/test/regress/expected/qp_with_clause.out @@ -11089,7 +11089,7 @@ ORDER BY 1, 2, 3, 4 DESC LIMIT 25; 35NAME | 7 | 5NAME | 1 (25 rows) --- Test that SharedInputScan within the same slice is always executed +-- Test that ShareInputScan within the same slice is always executed set gp_cte_sharing=on; -- start_ignore CREATE TABLE car (a int, b int); diff --git a/contrib/pax_storage/src/test/regress/expected/query_finish_pending.out b/contrib/pax_storage/src/test/regress/expected/query_finish_pending.out index cd162170e13..f075a0f75a2 100644 --- a/contrib/pax_storage/src/test/regress/expected/query_finish_pending.out +++ b/contrib/pax_storage/src/test/regress/expected/query_finish_pending.out @@ -92,7 +92,7 @@ select 
gp_inject_fault('execsort_sort_bounded_heap', 'status', 2); (1 row) --- test if shared input scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, +-- test if Share Input Scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, -- where the child is a Sort node drop table if exists testsisc; NOTICE: table "testsisc" does not exist, skipping @@ -104,7 +104,7 @@ insert into testsisc select i, i % 1000, i % 100000, i % 75 from (select count(*) as nsegments from gp_segment_configuration where role='p' and content >= 0) foo) bar; set gp_resqueue_print_operator_memory_limits=on; set statement_mem='2MB'; --- ORCA does not generate SharedInputScan with a Sort node underneath it. For +-- ORCA does not generate ShareInputScan with a Sort node underneath it. For -- the following query, ORCA disregards the order by inside the cte definition; -- planner on the other hand does not. set optimizer=off; @@ -114,8 +114,8 @@ select gp_inject_fault('execshare_input_next', 'reset', 2); Success: (1 row) --- Set QueryFinishPending to true after SharedInputScan has retrieved the first tuple. --- This will eagerly free the memory context of shared input scan's child node. +-- Set QueryFinishPending to true after ShareInputScan has retrieved the first tuple. +-- This will eagerly free the memory context of Share Input Scan's child node. select gp_inject_fault('execshare_input_next', 'finish_pending', 2); gp_inject_fault ----------------- @@ -137,7 +137,7 @@ select gp_inject_fault('execshare_input_next', 'status', 2); (1 row) --- test if shared input scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, +-- test if Share Input Scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, -- where the child is a Sort node and sort_mk algorithm is used select gp_inject_fault('execshare_input_next', 'reset', 2); gp_inject_fault @@ -145,8 +145,8 @@ select gp_inject_fault('execshare_input_next', 'reset', 2); Success: (1 row) --- Set QueryFinishPending to true after SharedInputScan has retrieved the first tuple. --- This will eagerly free the memory context of shared input scan's child node. +-- Set QueryFinishPending to true after ShareInputScan has retrieved the first tuple. +-- This will eagerly free the memory context of Share Input Scan's child node. 
select gp_inject_fault('execshare_input_next', 'finish_pending', 2); gp_inject_fault ----------------- diff --git a/contrib/pax_storage/src/test/regress/expected/rowsecurity.out b/contrib/pax_storage/src/test/regress/expected/rowsecurity.out index 4cd81f77e1e..60b9a3ed711 100644 --- a/contrib/pax_storage/src/test/regress/expected/rowsecurity.out +++ b/contrib/pax_storage/src/test/regress/expected/rowsecurity.out @@ -6,7 +6,7 @@ SET client_min_messages TO 'warning'; SET gp_enable_relsize_collection to on; -- Pax filter will call the f_leak, then output is not right -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; DROP USER IF EXISTS regress_rls_alice; DROP USER IF EXISTS regress_rls_bob; DROP USER IF EXISTS regress_rls_carol; @@ -1878,15 +1878,15 @@ AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; -> Update on t2 t2_1 -> Explicit Redistribute Motion 3:3 (slice2; segments: 3) -> Hash Join - Hash Cond: (t2_2.b = t2_1.b) + Hash Cond: (t2_1.b = t2_2.b) -> Redistribute Motion 3:3 (slice3; segments: 3) - Hash Key: t2_2.b - -> Seq Scan on t2 t2_2 + Hash Key: t2_1.b + -> Seq Scan on t2 t2_1 Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Hash -> Redistribute Motion 3:3 (slice4; segments: 3) - Hash Key: t2_1.b - -> Seq Scan on t2 t2_1 + Hash Key: t2_2.b + -> Seq Scan on t2 t2_2 Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) Optimizer: Postgres query optimizer (15 rows) @@ -4148,4 +4148,4 @@ CREATE POLICY p1 ON rls_tbl_force USING (c1 = 5) WITH CHECK (c1 < 5); CREATE POLICY p2 ON rls_tbl_force FOR SELECT USING (c1 = 8); CREATE POLICY p3 ON rls_tbl_force FOR UPDATE USING (c1 = 8) WITH CHECK (c1 >= 5); CREATE POLICY p4 ON rls_tbl_force FOR DELETE USING (c1 = 8); -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/expected/rowsecurity_optimizer.out b/contrib/pax_storage/src/test/regress/expected/rowsecurity_optimizer.out index da828356739..c5229896acf 100644 --- a/contrib/pax_storage/src/test/regress/expected/rowsecurity_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/rowsecurity_optimizer.out @@ -6,7 +6,7 @@ SET client_min_messages TO 'warning'; SET gp_enable_relsize_collection to on; -- Pax filter will call the f_leak, then output is not right -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; DROP USER IF EXISTS regress_rls_alice; DROP USER IF EXISTS regress_rls_bob; DROP USER IF EXISTS regress_rls_carol; @@ -1878,15 +1878,15 @@ AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; -> Update on t2 t2_1 -> Explicit Redistribute Motion 3:3 (slice2; segments: 3) -> Hash Join - Hash Cond: (t2_2.b = t2_1.b) + Hash Cond: (t2_1.b = t2_2.b) -> Redistribute Motion 3:3 (slice3; segments: 3) - Hash Key: t2_2.b - -> Seq Scan on t2 t2_2 + Hash Key: t2_1.b + -> Seq Scan on t2 t2_1 Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Hash -> Redistribute Motion 3:3 (slice4; segments: 3) - Hash Key: t2_1.b - -> Seq Scan on t2 t2_1 + Hash Key: t2_2.b + -> Seq Scan on t2 t2_2 Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) Optimizer: Postgres query optimizer (15 rows) @@ -4162,4 +4162,4 @@ CREATE POLICY p1 ON rls_tbl_force USING (c1 = 5) WITH CHECK (c1 < 5); CREATE POLICY p2 ON rls_tbl_force FOR SELECT USING (c1 = 8); CREATE POLICY p3 ON rls_tbl_force FOR UPDATE USING (c1 = 8) WITH CHECK (c1 >= 5); CREATE POLICY p4 ON rls_tbl_force FOR DELETE USING (c1 = 8); -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git 
a/contrib/pax_storage/src/test/regress/expected/select_views.out b/contrib/pax_storage/src/test/regress/expected/select_views.out index 0865c2638db..5b87a438ca8 100644 --- a/contrib/pax_storage/src/test/regress/expected/select_views.out +++ b/contrib/pax_storage/src/test/regress/expected/select_views.out @@ -2,7 +2,7 @@ -- SELECT_VIEWS -- test the views defined in CREATE_VIEWS -- -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; SELECT * FROM street; name | thepath | cname ------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------- @@ -1581,4 +1581,4 @@ NOTICE: f_leak => hamburger (seg0 slice1 127.0.0.1:25432 pid=890) -- Cleanup. RESET SESSION AUTHORIZATION; DROP ROLE regress_alice; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/expected/select_views_optimizer.out b/contrib/pax_storage/src/test/regress/expected/select_views_optimizer.out index b4f5ce7faa7..e5691b5969a 100644 --- a/contrib/pax_storage/src/test/regress/expected/select_views_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/select_views_optimizer.out @@ -2,7 +2,7 @@ -- SELECT_VIEWS -- test the views defined in CREATE_VIEWS -- -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; SELECT * FROM street; name | thepath | cname ------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------- @@ -1589,4 +1589,4 @@ NOTICE: f_leak => hamburger (seg1 slice1 127.0.0.1:7003 pid=16817) -- Cleanup. RESET SESSION AUTHORIZATION; DROP ROLE regress_alice; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/expected/shared_scan_optimizer.out b/contrib/pax_storage/src/test/regress/expected/shared_scan_optimizer.out index a8ab8bd0c23..a256c71063a 100644 --- a/contrib/pax_storage/src/test/regress/expected/shared_scan_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/shared_scan_optimizer.out @@ -111,7 +111,7 @@ WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 LIMIT 10) SELECT a, 1, 1 FROM -> Shared Scan (share slice:id 3:0) -> Hash -> Seq Scan on t2 - Optimizer: Pivotal Optimizer (GPORCA) + Optimizer: GPORCA (17 rows) -- This functions returns one more column than expected. diff --git a/contrib/pax_storage/src/test/regress/expected/stats_ext.out b/contrib/pax_storage/src/test/regress/expected/stats_ext.out index 1891db48e66..9de552fb0aa 100644 --- a/contrib/pax_storage/src/test/regress/expected/stats_ext.out +++ b/contrib/pax_storage/src/test/regress/expected/stats_ext.out @@ -4,7 +4,7 @@ -- roughly the same estimates as in upstream. 
set gp_selectivity_damping_for_scans = off; set gp_selectivity_damping_for_joins = off; -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; -- Set constraint_exclusion to partition as same as upstream to make test pass. set constraint_exclusion to 'partition'; -- @@ -3233,4 +3233,4 @@ NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to table tststats.priv_test_tbl drop cascades to view tststats.priv_test_view DROP USER regress_stats_user1; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/expected/stats_ext_optimizer.out b/contrib/pax_storage/src/test/regress/expected/stats_ext_optimizer.out index 46a9c4b4273..a3818d9bf0c 100644 --- a/contrib/pax_storage/src/test/regress/expected/stats_ext_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/stats_ext_optimizer.out @@ -4,7 +4,7 @@ -- roughly the same estimates as in upstream. set gp_selectivity_damping_for_scans = off; set gp_selectivity_damping_for_joins = off; -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; -- Set constraint_exclusion to partition as same as upstream to make test pass. set constraint_exclusion to 'partition'; -- @@ -3268,4 +3268,4 @@ NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to table tststats.priv_test_tbl drop cascades to view tststats.priv_test_view DROP USER regress_stats_user1; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/expected/subselect_gp_optimizer.out b/contrib/pax_storage/src/test/regress/expected/subselect_gp_optimizer.out index 41ab2a59a70..8c4b96b6be8 100644 --- a/contrib/pax_storage/src/test/regress/expected/subselect_gp_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/subselect_gp_optimizer.out @@ -1643,110 +1643,102 @@ EXPLAIN SELECT '' AS five, f1 AS "Correlated Field" EXPLAIN select count(*) from (select 1 from tenk1 a where unique1 IN (select hundred from tenk1 b)) ss; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- Finalize Aggregate (cost=0.00..826.06 rows=1 width=8) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..826.06 rows=1 width=8) -> Partial Aggregate (cost=0.00..826.06 rows=1 width=8) -> Nested Loop (cost=0.00..826.06 rows=34 width=1) Join Filter: true - -> GroupAggregate (cost=0.00..431.94 rows=34 width=4) + -> HashAggregate (cost=0.00..431.95 rows=34 width=4) Group Key: b.hundred - -> Sort (cost=0.00..431.94 rows=34 width=4) - Sort Key: b.hundred - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.94 rows=34 width=4) - Hash Key: b.hundred - -> Streaming HashAggregate (cost=0.00..431.94 rows=34 width=4) - Group Key: b.hundred - -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.94 rows=100 width=4) + Hash Key: b.hundred + -> Streaming HashAggregate (cost=0.00..431.94 rows=100 width=4) + Group Key: b.hundred + -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) -> Bitmap Heap Scan on tenk1 a (cost=0.00..394.11 rows=1 width=1) Recheck Cond: (unique1 = b.hundred) -> Bitmap Index Scan on tenk1_unique1 (cost=0.00..0.00 rows=0 width=0) Index Cond: (unique1 = b.hundred) Optimizer: GPORCA -(19 rows) +(17 rows) EXPLAIN select 
count(distinct ss.ten) from (select ten from tenk1 a where unique1 IN (select hundred from tenk1 b)) ss; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=0.00..826.06 rows=1 width=8) - -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..826.06 rows=1 width=8) - -> Partial Aggregate (cost=0.00..826.06 rows=1 width=8) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..826.06 rows=34 width=4) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=0.00..826.07 rows=1 width=8) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..826.07 rows=1 width=8) + -> Partial Aggregate (cost=0.00..826.07 rows=1 width=8) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..826.07 rows=34 width=4) Hash Key: a.ten - -> Nested Loop (cost=0.00..826.06 rows=34 width=4) + -> Nested Loop (cost=0.00..826.07 rows=34 width=4) Join Filter: true - -> GroupAggregate (cost=0.00..431.94 rows=34 width=4) + -> HashAggregate (cost=0.00..431.95 rows=34 width=4) Group Key: b.hundred - -> Sort (cost=0.00..431.94 rows=34 width=4) - Sort Key: b.hundred - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.94 rows=34 width=4) - Hash Key: b.hundred - -> Streaming HashAggregate (cost=0.00..431.94 rows=34 width=4) - Group Key: b.hundred - -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.94 rows=100 width=4) + Hash Key: b.hundred + -> Streaming HashAggregate (cost=0.00..431.94 rows=100 width=4) + Group Key: b.hundred + -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) -> Bitmap Heap Scan on tenk1 a (cost=0.00..394.12 rows=1 width=4) Recheck Cond: (unique1 = b.hundred) -> Bitmap Index Scan on tenk1_unique1 (cost=0.00..0.00 rows=0 width=0) Index Cond: (unique1 = b.hundred) Optimizer: GPORCA -(21 rows) +(19 rows) EXPLAIN select count(*) from (select 1 from tenk1 a where unique1 IN (select distinct hundred from tenk1 b)) ss; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- Finalize Aggregate (cost=0.00..826.06 rows=1 width=8) -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..826.06 rows=1 width=8) -> Partial Aggregate (cost=0.00..826.06 rows=1 width=8) -> Nested Loop (cost=0.00..826.06 rows=34 width=1) Join Filter: true - -> GroupAggregate (cost=0.00..431.94 rows=34 width=4) + -> HashAggregate (cost=0.00..431.95 rows=34 width=4) Group Key: b.hundred - -> Sort (cost=0.00..431.94 rows=34 width=4) - Sort Key: b.hundred - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.94 rows=34 width=4) - Hash Key: b.hundred - -> Streaming HashAggregate (cost=0.00..431.94 rows=34 width=4) - Group Key: b.hundred - -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.94 rows=100 width=4) + Hash Key: b.hundred + -> Streaming HashAggregate (cost=0.00..431.94 rows=100 width=4) + Group Key: b.hundred + -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) -> Bitmap Heap Scan on tenk1 a (cost=0.00..394.11 rows=1 width=1) Recheck Cond: (unique1 = b.hundred) -> Bitmap Index Scan on tenk1_unique1 
(cost=0.00..0.00 rows=0 width=0) Index Cond: (unique1 = b.hundred) Optimizer: GPORCA -(19 rows) +(17 rows) EXPLAIN select count(distinct ss.ten) from (select ten from tenk1 a where unique1 IN (select distinct hundred from tenk1 b)) ss; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=0.00..826.06 rows=1 width=8) - -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..826.06 rows=1 width=8) - -> Partial Aggregate (cost=0.00..826.06 rows=1 width=8) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..826.06 rows=34 width=4) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=0.00..826.07 rows=1 width=8) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..826.07 rows=1 width=8) + -> Partial Aggregate (cost=0.00..826.07 rows=1 width=8) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..826.07 rows=34 width=4) Hash Key: a.ten - -> Nested Loop (cost=0.00..826.06 rows=34 width=4) + -> Nested Loop (cost=0.00..826.07 rows=34 width=4) Join Filter: true - -> GroupAggregate (cost=0.00..431.94 rows=34 width=4) + -> HashAggregate (cost=0.00..431.95 rows=34 width=4) Group Key: b.hundred - -> Sort (cost=0.00..431.94 rows=34 width=4) - Sort Key: b.hundred - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.94 rows=34 width=4) - Hash Key: b.hundred - -> Streaming HashAggregate (cost=0.00..431.94 rows=34 width=4) - Group Key: b.hundred - -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.94 rows=100 width=4) + Hash Key: b.hundred + -> Streaming HashAggregate (cost=0.00..431.94 rows=100 width=4) + Group Key: b.hundred + -> Seq Scan on tenk1 b (cost=0.00..431.51 rows=3334 width=4) -> Bitmap Heap Scan on tenk1 a (cost=0.00..394.12 rows=1 width=4) Recheck Cond: (unique1 = b.hundred) -> Bitmap Index Scan on tenk1_unique1 (cost=0.00..0.00 rows=0 width=0) Index Cond: (unique1 = b.hundred) Optimizer: GPORCA -(21 rows) +(19 rows) -- -- In case of simple exists query, planner can generate alternative diff --git a/contrib/pax_storage/src/test/regress/expected/tsrf_optimizer.out b/contrib/pax_storage/src/test/regress/expected/tsrf_optimizer.out index 187b086d4cb..90a3a352a98 100644 --- a/contrib/pax_storage/src/test/regress/expected/tsrf_optimizer.out +++ b/contrib/pax_storage/src/test/regress/expected/tsrf_optimizer.out @@ -88,15 +88,16 @@ ANALYZE few; -- SRF with a provably-dummy relation explain (verbose, costs off) SELECT unnest(ARRAY[1, 2]) FROM few WHERE false; - QUERY PLAN --------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------- ProjectSet Output: unnest('{1,2}'::integer[]) -> Result - Output: NULL::integer, NULL::tid, NULL::xid, NULL::cid, NULL::xid, NULL::cid, NULL::oid, NULL::integer, NULL::oid + Output: NULL::integer, NULL::tid, NULL::oid, NULL::integer, NULL::oid One-Time Filter: false - Optimizer: Pivotal Optimizer (GPORCA) -(6 rows) + Settings: optimizer = 'on' + Optimizer: GPORCA +(7 rows) SELECT unnest(ARRAY[1, 2]) FROM few WHERE false; unnest @@ -117,12 +118,12 @@ SELECT * FROM few f1, -> ProjectSet Output: unnest('{1,2}'::integer[]) -> Result - Output: NULL::integer, NULL::tid, NULL::xid, NULL::cid, NULL::xid, NULL::cid, NULL::oid, NULL::integer, NULL::oid + Output: NULL::integer, 
NULL::tid, NULL::oid, NULL::integer, NULL::oid One-Time Filter: false -> Seq Scan on public.few f1 Output: id, dataa, datab - Settings: enable_parallel = 'off', optimizer = 'on' - Optimizer: Pivotal Optimizer (GPORCA) + Settings: optimizer = 'on' + Optimizer: GPORCA (14 rows) SELECT * FROM few f1, diff --git a/contrib/pax_storage/src/test/regress/input/external_table.source b/contrib/pax_storage/src/test/regress/input/external_table.source index ea018e581bf..e4d57c1ae83 100644 --- a/contrib/pax_storage/src/test/regress/input/external_table.source +++ b/contrib/pax_storage/src/test/regress/input/external_table.source @@ -1365,10 +1365,22 @@ SELECT COUNT(*) FROM gp_read_error_log('exttab_heap_join_1'); \! rm @abs_srcdir@/data/tableless.csv -- start_ignore +DROP EXTERNAL TABLE IF EXISTS ext_nation_on_coordinator; DROP EXTERNAL TABLE IF EXISTS exttab_with_on_coordinator; -- end_ignore --- Create external table with on clause +-- Create external table on coordinator +-- good, should fetch data from coordinator +CREATE EXTERNAL TABLE ext_nation_on_coordinator ( N_NATIONKEY INTEGER , + N_NAME CHAR(25) , + N_REGIONKEY INTEGER , + N_COMMENT VARCHAR(152)) +location ('file://@hostname@@abs_srcdir@/data/nation.tbl' ) ON COORDINATOR +FORMAT 'text' (delimiter '|'); +SELECT gp_segment_id, * FROM ext_nation_on_coordinator ORDER BY N_NATIONKEY DESC LIMIT 5; +DROP EXTERNAL TABLE IF EXISTS ext_nation_on_coordinator; + +-- bad, should error CREATE EXTERNAL TABLE exttab_with_on_coordinator( i int, j text ) LOCATION ('file://@hostname@@abs_srcdir@/data/exttab_few_errors.data') ON COORDINATOR FORMAT 'TEXT' (DELIMITER '|'); diff --git a/contrib/pax_storage/src/test/regress/input/temp_tablespaces.source b/contrib/pax_storage/src/test/regress/input/temp_tablespaces.source index c3d19970517..c36522fe90d 100644 --- a/contrib/pax_storage/src/test/regress/input/temp_tablespaces.source +++ b/contrib/pax_storage/src/test/regress/input/temp_tablespaces.source @@ -45,7 +45,7 @@ CREATE TABLE tts_foo (i int, j int) distributed by(i); insert into tts_foo select i, i from generate_series(1,80000)i; ANALYZE tts_foo; set gp_cte_sharing=on; --- CBDB_PARALLEL_FIXME: since we disabled shared input scan in parallel mode, sisc_xslice_temp_files +-- CBDB_PARALLEL_FIXME: since we disabled Share Input Scan in parallel mode, sisc_xslice_temp_files -- will never be triggered. We need set max_parallel_workers_per_gather to 0 in this case. set max_parallel_workers_per_gather = 0; diff --git a/contrib/pax_storage/src/test/regress/mem_quota_util.py b/contrib/pax_storage/src/test/regress/mem_quota_util.py index 6e38a380029..7d830530c24 100755 --- a/contrib/pax_storage/src/test/regress/mem_quota_util.py +++ b/contrib/pax_storage/src/test/regress/mem_quota_util.py @@ -17,7 +17,7 @@ from multiprocessing import Process from gppylib.commands import unix except Exception as e: - sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('Cannot import modules. Please check that you have sourced cloudberry-env.sh. 
Detail: ' + str(e)) def parseargs( help=False ): parser = OptionParser() diff --git a/contrib/pax_storage/src/test/regress/output/external_table.source b/contrib/pax_storage/src/test/regress/output/external_table.source index 19791a992c3..53c72200f01 100644 --- a/contrib/pax_storage/src/test/regress/output/external_table.source +++ b/contrib/pax_storage/src/test/regress/output/external_table.source @@ -2601,14 +2601,36 @@ SELECT COUNT(*) FROM gp_read_error_log('exttab_heap_join_1'); \! rm @abs_srcdir@/data/tableless.csv -- start_ignore +DROP EXTERNAL TABLE IF EXISTS ext_nation_on_coordinator; +NOTICE: table "ext_nation_on_coordinator" does not exist, skipping DROP EXTERNAL TABLE IF EXISTS exttab_with_on_coordinator; NOTICE: table "exttab_with_on_coordinator" does not exist, skipping -- end_ignore --- Create external table with on clause +-- Create external table on coordinator +-- good, should fetch data from coordinator +CREATE EXTERNAL TABLE ext_nation_on_coordinator ( N_NATIONKEY INTEGER , + N_NAME CHAR(25) , + N_REGIONKEY INTEGER , + N_COMMENT VARCHAR(152)) +location ('file://@hostname@@abs_srcdir@/data/nation.tbl' ) ON COORDINATOR +FORMAT 'text' (delimiter '|'); +SELECT gp_segment_id, * FROM ext_nation_on_coordinator ORDER BY N_NATIONKEY DESC LIMIT 5; + gp_segment_id | n_nationkey | n_name | n_regionkey | n_comment +---------------+-------------+---------------------------+-------------+---------------------------------------------------------------------------------------------------------------- + -1 | 24 | UNITED STATES | 1 | y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be + -1 | 23 | UNITED KINGDOM | 3 | eans boost carefully special requests. accounts are. carefull + -1 | 22 | RUSSIA | 3 | requests against the platelets use never according to the quickly regular pint + -1 | 21 | VIETNAM | 2 | hely enticingly express accounts. even, final + -1 | 20 | SAUDI ARABIA | 4 | ts. silent requests haggle. closely express packages sleep across the blithely +(5 rows) + +DROP EXTERNAL TABLE IF EXISTS ext_nation_on_coordinator; +-- bad, should error CREATE EXTERNAL TABLE exttab_with_on_coordinator( i int, j text ) LOCATION ('file://@hostname@@abs_srcdir@/data/exttab_few_errors.data') ON COORDINATOR FORMAT 'TEXT' (DELIMITER '|'); SELECT * FROM exttab_with_on_coordinator; -ERROR: 'ON COORDINATOR' is not supported by this protocol yet +ERROR: invalid input syntax for type integer: "error_0" +CONTEXT: External table exttab_with_on_coordinator, line 3 of file://@hostname@@abs_srcdir@/data/exttab_few_errors.data, column i DROP EXTERNAL TABLE IF EXISTS exttab_with_on_coordinator; -- start_ignore DROP EXTERNAL TABLE IF EXISTS exttab_with_option_empty; diff --git a/contrib/pax_storage/src/test/regress/output/external_table_optimizer.source b/contrib/pax_storage/src/test/regress/output/external_table_optimizer.source index 68523e63aae..6c4f884d292 100644 --- a/contrib/pax_storage/src/test/regress/output/external_table_optimizer.source +++ b/contrib/pax_storage/src/test/regress/output/external_table_optimizer.source @@ -2601,14 +2601,36 @@ SELECT COUNT(*) FROM gp_read_error_log('exttab_heap_join_1'); \! 
rm @abs_srcdir@/data/tableless.csv -- start_ignore +DROP EXTERNAL TABLE IF EXISTS ext_nation_on_coordinator; +NOTICE: table "ext_nation_on_coordinator" does not exist, skipping DROP EXTERNAL TABLE IF EXISTS exttab_with_on_coordinator; NOTICE: table "exttab_with_on_coordinator" does not exist, skipping -- end_ignore --- Create external table with on clause +-- Create external table on coordinator +-- good, should fetch data from coordinator +CREATE EXTERNAL TABLE ext_nation_on_coordinator ( N_NATIONKEY INTEGER , + N_NAME CHAR(25) , + N_REGIONKEY INTEGER , + N_COMMENT VARCHAR(152)) +location ('file://@hostname@@abs_srcdir@/data/nation.tbl' ) ON COORDINATOR +FORMAT 'text' (delimiter '|'); +SELECT gp_segment_id, * FROM ext_nation_on_coordinator ORDER BY N_NATIONKEY DESC LIMIT 5; + gp_segment_id | n_nationkey | n_name | n_regionkey | n_comment +---------------+-------------+---------------------------+-------------+---------------------------------------------------------------------------------------------------------------- + -1 | 24 | UNITED STATES | 1 | y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be + -1 | 23 | UNITED KINGDOM | 3 | eans boost carefully special requests. accounts are. carefull + -1 | 22 | RUSSIA | 3 | requests against the platelets use never according to the quickly regular pint + -1 | 21 | VIETNAM | 2 | hely enticingly express accounts. even, final + -1 | 20 | SAUDI ARABIA | 4 | ts. silent requests haggle. closely express packages sleep across the blithely +(5 rows) + +DROP EXTERNAL TABLE IF EXISTS ext_nation_on_coordinator; +-- bad, should error CREATE EXTERNAL TABLE exttab_with_on_coordinator( i int, j text ) LOCATION ('file://@hostname@@abs_srcdir@/data/exttab_few_errors.data') ON COORDINATOR FORMAT 'TEXT' (DELIMITER '|'); SELECT * FROM exttab_with_on_coordinator; -ERROR: 'ON COORDINATOR' is not supported by this protocol yet +ERROR: invalid input syntax for type integer: "error_0" +CONTEXT: External table exttab_with_on_coordinator, line 3 of file://@hostname@@abs_srcdir@/data/exttab_few_errors.data, column i DROP EXTERNAL TABLE IF EXISTS exttab_with_on_coordinator; -- start_ignore DROP EXTERNAL TABLE IF EXISTS exttab_with_option_empty; diff --git a/contrib/pax_storage/src/test/regress/output/temp_tablespaces.source b/contrib/pax_storage/src/test/regress/output/temp_tablespaces.source index 1f579686f3d..f11164ce226 100644 --- a/contrib/pax_storage/src/test/regress/output/temp_tablespaces.source +++ b/contrib/pax_storage/src/test/regress/output/temp_tablespaces.source @@ -56,7 +56,7 @@ CREATE TABLE tts_foo (i int, j int) distributed by(i); insert into tts_foo select i, i from generate_series(1,80000)i; ANALYZE tts_foo; set gp_cte_sharing=on; --- CBDB_PARALLEL_FIXME: since we disabled shared input scan in parallel mode, sisc_xslice_temp_files +-- CBDB_PARALLEL_FIXME: since we disabled Share Input Scan in parallel mode, sisc_xslice_temp_files -- will never be triggered. We need set max_parallel_workers_per_gather to 0 in this case. 
set max_parallel_workers_per_gather = 0; -- CASE 1: when temp_tablespaces is set, hashagg and share-input-scan diff --git a/contrib/pax_storage/src/test/regress/sql/agg_pushdown.sql b/contrib/pax_storage/src/test/regress/sql/agg_pushdown.sql index 3378f8b64e6..0d1e45ae992 100644 --- a/contrib/pax_storage/src/test/regress/sql/agg_pushdown.sql +++ b/contrib/pax_storage/src/test/regress/sql/agg_pushdown.sql @@ -1,5 +1,8 @@ -- disable ORCA SET optimizer TO off; +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- Test case group 1: basic functions CREATE TABLE agg_pushdown_parent ( diff --git a/contrib/pax_storage/src/test/regress/sql/aggregates.sql b/contrib/pax_storage/src/test/regress/sql/aggregates.sql index 158223e2633..136c70fb384 100644 --- a/contrib/pax_storage/src/test/regress/sql/aggregates.sql +++ b/contrib/pax_storage/src/test/regress/sql/aggregates.sql @@ -1,6 +1,9 @@ -- -- AGGREGATES -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- start_ignore -- end_ignore diff --git a/contrib/pax_storage/src/test/regress/sql/autostats.sql b/contrib/pax_storage/src/test/regress/sql/autostats.sql index 42831cda5f9..ce19afac413 100644 --- a/contrib/pax_storage/src/test/regress/sql/autostats.sql +++ b/contrib/pax_storage/src/test/regress/sql/autostats.sql @@ -11,7 +11,7 @@ -- end_matchignore set gp_autostats_mode=on_change; set gp_autostats_on_change_threshold=9; -set pax_enable_debug = false; +set pax.enable_debug = false; set log_autostats=on; set client_min_messages=log; reset optimizer_trace_fallback; diff --git a/contrib/pax_storage/src/test/regress/sql/bfv_aggregate.sql b/contrib/pax_storage/src/test/regress/sql/bfv_aggregate.sql index 92d90ae72d9..8bc9d97c513 100644 --- a/contrib/pax_storage/src/test/regress/sql/bfv_aggregate.sql +++ b/contrib/pax_storage/src/test/regress/sql/bfv_aggregate.sql @@ -1,5 +1,8 @@ create schema bfv_aggregate; set search_path=bfv_aggregate; +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- -- Window function with outer references in PARTITION BY/ORDER BY clause diff --git a/contrib/pax_storage/src/test/regress/sql/cbdb_parallel.sql b/contrib/pax_storage/src/test/regress/sql/cbdb_parallel.sql index ea47dfdcf60..78d2cb382dc 100644 --- a/contrib/pax_storage/src/test/regress/sql/cbdb_parallel.sql +++ b/contrib/pax_storage/src/test/regress/sql/cbdb_parallel.sql @@ -695,7 +695,7 @@ declare results1 RECORD; begin create table t_p(c1 int, c2 int) using pax with(parallel_workers=8) distributed by(c1); - set pax_max_tuples_per_file to 131072; + set pax.max_tuples_per_file to 131072; insert into t_p select i, i+1 from generate_series(1, 10000000)i; analyze t_p; if ao_row then diff --git a/contrib/pax_storage/src/test/regress/sql/create_index.sql b/contrib/pax_storage/src/test/regress/sql/create_index.sql index a0e3fc5ccdc..2cb3cc77fd0 100644 --- a/contrib/pax_storage/src/test/regress/sql/create_index.sql +++ b/contrib/pax_storage/src/test/regress/sql/create_index.sql @@ -1111,7 +1111,6 @@ COMMIT; -- we keep the `CONCURRENTLY` to make the following commands fail, -- so these commands will not cause deadlock with test create_view, -- like `drop schema xxx cascade;`. --- See more details at https://code.hashdata.xyz/cloudberry/cbdb/-/issues/54 REINDEX TABLE CONCURRENTLY pg_class; -- no catalog relation REINDEX INDEX CONCURRENTLY pg_class_oid_index; -- no catalog index -- These are the toast table and index of pg_authid. 
diff --git a/contrib/pax_storage/src/test/regress/sql/direct_dispatch.sql b/contrib/pax_storage/src/test/regress/sql/direct_dispatch.sql index d9e7b562a0a..783c6cf77f6 100644 --- a/contrib/pax_storage/src/test/regress/sql/direct_dispatch.sql +++ b/contrib/pax_storage/src/test/regress/sql/direct_dispatch.sql @@ -1,5 +1,8 @@ -- turn off autostats so we don't have to worry about the logging of the autostat queries set gp_autostats_mode = None; +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- create needed tables (in a transaction, for speed) begin; diff --git a/contrib/pax_storage/src/test/regress/sql/eagerfree.sql b/contrib/pax_storage/src/test/regress/sql/eagerfree.sql index 81500e4575b..e2b79d19f9a 100644 --- a/contrib/pax_storage/src/test/regress/sql/eagerfree.sql +++ b/contrib/pax_storage/src/test/regress/sql/eagerfree.sql @@ -1,5 +1,8 @@ create schema eagerfree; set search_path=eagerfree; +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore create table smallt (i int, t text, d date) distributed by (i); insert into smallt select i%10, 'text ' || (i%15), '2011-01-01'::date + ((i%20) || ' days')::interval diff --git a/contrib/pax_storage/src/test/regress/sql/gp_aggregates_costs.sql b/contrib/pax_storage/src/test/regress/sql/gp_aggregates_costs.sql index c1337de6083..8c3263a58bf 100644 --- a/contrib/pax_storage/src/test/regress/sql/gp_aggregates_costs.sql +++ b/contrib/pax_storage/src/test/regress/sql/gp_aggregates_costs.sql @@ -5,6 +5,9 @@ insert into cost_agg_t2 select i, random() * 99999, i % 300000 from generate_ser analyze cost_agg_t1; analyze cost_agg_t2; +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- -- Test planner's decisions on aggregates when only little memory is available. -- diff --git a/contrib/pax_storage/src/test/regress/sql/gp_hashagg.sql b/contrib/pax_storage/src/test/regress/sql/gp_hashagg.sql index 36000de8af8..7b880c4e62c 100644 --- a/contrib/pax_storage/src/test/regress/sql/gp_hashagg.sql +++ b/contrib/pax_storage/src/test/regress/sql/gp_hashagg.sql @@ -10,6 +10,9 @@ insert into hashagg_test values (1,1,'1/1/2006','hi',2); insert into hashagg_test values (1,1,'1/2/2006','hi',3); insert into hashagg_test values (1,1,'1/3/2006','hi',4); +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- this will get the wrong answer (right number of rows, wrong aggregates) set enable_seqscan=off; select grp,sum(v) from hashagg_test where id1 = 1 and id2 = 1 and day between '1/1/2006' and '1/31/2006' group by grp order by sum(v) desc; diff --git a/contrib/pax_storage/src/test/regress/sql/gporca.sql b/contrib/pax_storage/src/test/regress/sql/gporca.sql index 83100dd6c16..b19afd60d7b 100644 --- a/contrib/pax_storage/src/test/regress/sql/gporca.sql +++ b/contrib/pax_storage/src/test/regress/sql/gporca.sql @@ -1,6 +1,9 @@ -- -- ORCA tests -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- show version SELECT count(*) from gp_opt_version(); @@ -1977,11 +1980,11 @@ create table foo(a int, b int) distributed by (a); -- and log_min_duration_statement, they are the most obvious ones. 
set log_statement='none'; set log_min_duration_statement=-1; -set pax_enable_debug to off; +set pax.enable_debug to off; set client_min_messages='log'; explain select count(*) from foo group by cube(a,b); reset client_min_messages; -reset pax_enable_debug; +reset pax.enable_debug; reset log_statement; reset log_min_duration_statement; @@ -2054,11 +2057,11 @@ explain select * from foo where b in ('1', '2'); set optimizer_enable_ctas = off; set log_statement='none'; set log_min_duration_statement=-1; -set pax_enable_debug to off; +set pax.enable_debug to off; set client_min_messages='log'; create table foo_ctas(a) as (select generate_series(1,10)) distributed by (a); reset client_min_messages; -reset pax_enable_debug; +reset pax.enable_debug; reset log_min_duration_statement; reset log_statement; reset optimizer_enable_ctas; diff --git a/contrib/pax_storage/src/test/regress/sql/groupingsets.sql b/contrib/pax_storage/src/test/regress/sql/groupingsets.sql index 835a3cff623..294a52c2036 100644 --- a/contrib/pax_storage/src/test/regress/sql/groupingsets.sql +++ b/contrib/pax_storage/src/test/regress/sql/groupingsets.sql @@ -518,6 +518,7 @@ select array(select row(v.a,s1.*) from (select two,four, count(*) from onek grou -- test the knapsack +set hash_mem_multiplier = 1; set enable_indexscan = false; set work_mem = '64kB'; explain (costs off) @@ -541,6 +542,7 @@ explain (costs off) count(*) from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); +reset hash_mem_multiplier; -- check collation-sensitive matching between grouping expressions -- (similar to a check for aggregates, but there are additional code -- paths for GROUPING, so check again here) diff --git a/contrib/pax_storage/src/test/regress/sql/incremental_analyze.sql b/contrib/pax_storage/src/test/regress/sql/incremental_analyze.sql index 5260b8b2e29..ed3ccdc7853 100644 --- a/contrib/pax_storage/src/test/regress/sql/incremental_analyze.sql +++ b/contrib/pax_storage/src/test/regress/sql/incremental_analyze.sql @@ -14,7 +14,7 @@ ALTER DATABASE incrementalanalyze SET lc_monetary TO 'C'; \c incrementalanalyze DROP SCHEMA IF EXISTS incremental_analyze; CREATE SCHEMA incremental_analyze; -set pax_enable_debug = false; +set pax.enable_debug = false; -- end_ignore -- Test ANALYZE for different data types -- Case 1: Partitions have MCVs but after merge, none of the partition MCVs diff --git a/contrib/pax_storage/src/test/regress/sql/indexjoin.sql b/contrib/pax_storage/src/test/regress/sql/indexjoin.sql index b99b561e42c..59290287e3b 100644 --- a/contrib/pax_storage/src/test/regress/sql/indexjoin.sql +++ b/contrib/pax_storage/src/test/regress/sql/indexjoin.sql @@ -4,6 +4,9 @@ CREATE TABLE my_tt_agg_small ( trade_price numeric, trade_volume bigint ) DISTRIBUTED BY (symbol); +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore CREATE TABLE my_tq_agg_small ( diff --git a/contrib/pax_storage/src/test/regress/sql/limit.sql b/contrib/pax_storage/src/test/regress/sql/limit.sql index e47b3f9bf29..ab44f3b5e36 100644 --- a/contrib/pax_storage/src/test/regress/sql/limit.sql +++ b/contrib/pax_storage/src/test/regress/sql/limit.sql @@ -37,7 +37,7 @@ select * from int8_tbl limit (case when random() < 0.5 then null::bigint end); select * from int8_tbl offset (case when random() < 0.5 then null::bigint end); -- Test assorted cases involving backwards fetch from a LIMIT plan node --- Disable backward scan test which is not supported in this version of Cloudberry Database +-- Disable backward scan test which is not 
supported in this version of Apache Cloudberry --start_ignore /* * begin; diff --git a/contrib/pax_storage/src/test/regress/sql/limit_gp.sql b/contrib/pax_storage/src/test/regress/sql/limit_gp.sql index 88c934c0779..0a1b4e8fd58 100644 --- a/contrib/pax_storage/src/test/regress/sql/limit_gp.sql +++ b/contrib/pax_storage/src/test/regress/sql/limit_gp.sql @@ -2,6 +2,9 @@ -- Check for MPP-19310 and MPP-19857 where mksort produces wrong result -- on OPT build, and fails assertion on debug build if a "LIMIT" query -- spills to disk. +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore CREATE TABLE mksort_limit_test_table(dkey INT, jkey INT, rval REAL, tval TEXT default repeat('abcdefghijklmnopqrstuvwxyz', 300)) DISTRIBUTED BY (dkey); INSERT INTO mksort_limit_test_table VALUES(generate_series(1, 10000), generate_series(10001, 20000), sqrt(generate_series(10001, 20000))); diff --git a/contrib/pax_storage/src/test/regress/sql/olap_plans.sql b/contrib/pax_storage/src/test/regress/sql/olap_plans.sql index c4242a34565..5b7ed221f18 100644 --- a/contrib/pax_storage/src/test/regress/sql/olap_plans.sql +++ b/contrib/pax_storage/src/test/regress/sql/olap_plans.sql @@ -2,6 +2,9 @@ -- Test the planner's ability to produce different kinds of plans to implement -- grouping and aggregation. -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore drop table if exists olap_test; drop table if exists olap_test_single; diff --git a/contrib/pax_storage/src/test/regress/sql/partition_aggregate.sql b/contrib/pax_storage/src/test/regress/sql/partition_aggregate.sql index 0e1ea0eec80..d93f5cffc2c 100644 --- a/contrib/pax_storage/src/test/regress/sql/partition_aggregate.sql +++ b/contrib/pax_storage/src/test/regress/sql/partition_aggregate.sql @@ -5,6 +5,9 @@ -- Note: to ensure plan stability, it's a good idea to make the partitions of -- any one partitioned table in this test all have different numbers of rows. -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- Disable ORCA since it does support partition-wise aggregates set optimizer to off; diff --git a/contrib/pax_storage/src/test/regress/sql/partition_join.sql b/contrib/pax_storage/src/test/regress/sql/partition_join.sql index 1d188d1476f..311384e85b2 100644 --- a/contrib/pax_storage/src/test/regress/sql/partition_join.sql +++ b/contrib/pax_storage/src/test/regress/sql/partition_join.sql @@ -2,6 +2,9 @@ -- PARTITION_JOIN -- Test partitionwise join between partitioned tables -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- Disable ORCA since it does support partition-wise joins set optimizer to off; diff --git a/contrib/pax_storage/src/test/regress/sql/qp_misc.sql b/contrib/pax_storage/src/test/regress/sql/qp_misc.sql index 7aacbfd5268..3d4e81761b3 100644 --- a/contrib/pax_storage/src/test/regress/sql/qp_misc.sql +++ b/contrib/pax_storage/src/test/regress/sql/qp_misc.sql @@ -1,6 +1,6 @@ -- start_ignore -- --- Cloudberry Database database dump +-- Apache Cloudberry database dump -- SET client_encoding = 'UTF8'; @@ -1153,7 +1153,7 @@ ALTER TABLE ONLY tclob ADD CONSTRAINT clobpk PRIMARY KEY (rnum); -- --- Cloudberry Database database dump complete +-- Apache Cloudberry database dump complete -- -- end_ignore @@ -1379,7 +1379,7 @@ f1,f2,f3 ) Q ) P; -- SubqueryInCase_p1 -- test expected to fail until function supported in GPDB --- GPDB Limitation ERROR: Cloudberry Database does not yet support that query. DETAIL: The query contains a multi-row subquery. 
+-- GPDB Limitation ERROR: Apache Cloudberry does not yet support that query. DETAIL: The query contains a multi-row subquery. select 'SubqueryInCase_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -1452,7 +1452,7 @@ f1,f2,f3 ) Q ) P; -- SubqueryPredicateNotIn_p1 -- test expected to fail until function supported in GPDB --- GPDB Limitation ERROR: Cloudberry Database does not yet support that query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support that query. DETAIL: The query contains a multi-row subquery. select 'SubqueryPredicateNotIn_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -1507,7 +1507,7 @@ f1,f2,f3 ) Q ) P; -- SubqueryQuantifiedPredicateEmpty_p1 -- test expected to fail until GPDB support this function --- GPDB Limitation ERROR: Cloudberry Database does not yet support this query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support this query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateEmpty_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -1522,7 +1522,7 @@ f1,f2,f3 ) Q ) P; -- SubqueryQuantifiedPredicateLarge_p1 -- test expected to fail until GPDB supports this function --- GPDB Limitation ERROR: Cloudberry Database does not yet support that query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support that query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateLarge_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1,f2,f3, count(*) c from ( @@ -2801,7 +2801,7 @@ f1 ) Q ) P; -- SubqueryQuantifiedPredicateNull_gp_p1 -- test expected to fail until GPDB support function --- GPDB Limitation ERROR: Cloudberry Database does not yet support this query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support this query. DETAIL: The query contains a multi-row subquery. select 'SubqueryQuantifiedPredicateNull_gp_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1, count(*) c from ( @@ -2813,7 +2813,7 @@ f1 ) Q ) P; -- SubqueryQuantifiedPredicateSmall_gp_p1 -- test expected to fail until GPDB supports function --- GPDB Limitation ERROR: Cloudberry Database does not yet support this query. DETAIL: The query contains a multi-row subquery. +-- GPDB Limitation ERROR: Apache Cloudberry does not yet support this query. DETAIL: The query contains a multi-row subquery. 
select 'SubqueryQuantifiedPredicateSmall_gp_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( select f1, count(*) c from ( diff --git a/contrib/pax_storage/src/test/regress/sql/qp_misc_rio_join_small.sql b/contrib/pax_storage/src/test/regress/sql/qp_misc_rio_join_small.sql index bd20eedc202..0ababb42098 100644 --- a/contrib/pax_storage/src/test/regress/sql/qp_misc_rio_join_small.sql +++ b/contrib/pax_storage/src/test/regress/sql/qp_misc_rio_join_small.sql @@ -5,7 +5,7 @@ drop index if exists my_tt_agg_small_event_ts_ix; -- --- Cloudberry Database database dump +-- Apache Cloudberry database dump -- SET client_encoding = 'UTF8'; @@ -4042,11 +4042,11 @@ SSO 20101126114705550 415500 200 \. -- --- Cloudberry Database database dump complete +-- Apache Cloudberry database dump complete -- -- --- Cloudberry Database database dump +-- Apache Cloudberry database dump -- -- @@ -24078,7 +24078,7 @@ CREATE INDEX my_tq_agg_small_ets_end_ts_ix ON my_tq_agg_small USING btree (ets, -- --- Cloudberry Database database dump complete +-- Apache Cloudberry database dump complete -- set enable_hashjoin=off; diff --git a/contrib/pax_storage/src/test/regress/sql/qp_with_clause.sql b/contrib/pax_storage/src/test/regress/sql/qp_with_clause.sql index ad018427f55..567de4f00d1 100644 --- a/contrib/pax_storage/src/test/regress/sql/qp_with_clause.sql +++ b/contrib/pax_storage/src/test/regress/sql/qp_with_clause.sql @@ -10329,7 +10329,7 @@ WHERE e.deptno = dc1.deptno AND m.deptno = dmc1.dept_mgr_no ORDER BY 1, 2, 3, 4 DESC LIMIT 25; --- Test that SharedInputScan within the same slice is always executed +-- Test that ShareInputScan within the same slice is always executed set gp_cte_sharing=on; -- start_ignore diff --git a/contrib/pax_storage/src/test/regress/sql/query_finish_pending.sql b/contrib/pax_storage/src/test/regress/sql/query_finish_pending.sql index 220fd4c35e5..e2d0a186d0c 100644 --- a/contrib/pax_storage/src/test/regress/sql/query_finish_pending.sql +++ b/contrib/pax_storage/src/test/regress/sql/query_finish_pending.sql @@ -38,7 +38,7 @@ reset enable_parallel; select gp_inject_fault('execsort_sort_bounded_heap', 'status', 2); --- test if shared input scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, +-- test if Share Input Scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, -- where the child is a Sort node drop table if exists testsisc; create table testsisc (i1 int, i2 int, i3 int, i4 int); @@ -48,13 +48,13 @@ insert into testsisc select i, i % 1000, i % 100000, i % 75 from set gp_resqueue_print_operator_memory_limits=on; set statement_mem='2MB'; --- ORCA does not generate SharedInputScan with a Sort node underneath it. For +-- ORCA does not generate ShareInputScan with a Sort node underneath it. For -- the following query, ORCA disregards the order by inside the cte definition; -- planner on the other hand does not. set optimizer=off; select gp_inject_fault('execshare_input_next', 'reset', 2); --- Set QueryFinishPending to true after SharedInputScan has retrieved the first tuple. --- This will eagerly free the memory context of shared input scan's child node. +-- Set QueryFinishPending to true after ShareInputScan has retrieved the first tuple. +-- This will eagerly free the memory context of Share Input Scan's child node. 
select gp_inject_fault('execshare_input_next', 'finish_pending', 2); set enable_parallel = off; @@ -64,13 +64,13 @@ select * from cte c1, cte c2 limit 2; select gp_inject_fault('execshare_input_next', 'status', 2); --- test if shared input scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, +-- test if Share Input Scan deletes memory correctly when QueryFinishPending and its child has been eagerly freed, -- where the child is a Sort node and sort_mk algorithm is used select gp_inject_fault('execshare_input_next', 'reset', 2); --- Set QueryFinishPending to true after SharedInputScan has retrieved the first tuple. --- This will eagerly free the memory context of shared input scan's child node. +-- Set QueryFinishPending to true after ShareInputScan has retrieved the first tuple. +-- This will eagerly free the memory context of Share Input Scan's child node. select gp_inject_fault('execshare_input_next', 'finish_pending', 2); with cte as (select i2 from testsisc order by i2) diff --git a/contrib/pax_storage/src/test/regress/sql/rowsecurity.sql b/contrib/pax_storage/src/test/regress/sql/rowsecurity.sql index 648816bbf22..ba74fed3c00 100644 --- a/contrib/pax_storage/src/test/regress/sql/rowsecurity.sql +++ b/contrib/pax_storage/src/test/regress/sql/rowsecurity.sql @@ -8,7 +8,7 @@ SET client_min_messages TO 'warning'; SET gp_enable_relsize_collection to on; -- Pax filter will call the f_leak, then output is not right -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; DROP USER IF EXISTS regress_rls_alice; DROP USER IF EXISTS regress_rls_bob; @@ -1901,4 +1901,4 @@ CREATE POLICY p1 ON rls_tbl_force USING (c1 = 5) WITH CHECK (c1 < 5); CREATE POLICY p2 ON rls_tbl_force FOR SELECT USING (c1 = 8); CREATE POLICY p3 ON rls_tbl_force FOR UPDATE USING (c1 = 8) WITH CHECK (c1 >= 5); CREATE POLICY p4 ON rls_tbl_force FOR DELETE USING (c1 = 8); -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/sql/select_distinct.sql b/contrib/pax_storage/src/test/regress/sql/select_distinct.sql index 27b63e699a0..35facaa9ca3 100644 --- a/contrib/pax_storage/src/test/regress/sql/select_distinct.sql +++ b/contrib/pax_storage/src/test/regress/sql/select_distinct.sql @@ -1,6 +1,9 @@ -- -- SELECT_DISTINCT -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- -- awk '{print $3;}' onek.data | sort -n | uniq diff --git a/contrib/pax_storage/src/test/regress/sql/select_views.sql b/contrib/pax_storage/src/test/regress/sql/select_views.sql index 0ced30cea91..8270b99d44a 100644 --- a/contrib/pax_storage/src/test/regress/sql/select_views.sql +++ b/contrib/pax_storage/src/test/regress/sql/select_views.sql @@ -2,7 +2,7 @@ -- SELECT_VIEWS -- test the views defined in CREATE_VIEWS -- -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; SELECT * FROM street; SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2; @@ -167,4 +167,4 @@ EXECUTE p2; -- To be perform as a view without security-barrier -- Cleanup. 
RESET SESSION AUTHORIZATION; DROP ROLE regress_alice; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/sql/shared_scan.sql b/contrib/pax_storage/src/test/regress/sql/shared_scan.sql index 7234cef6e4a..d37eca9cdce 100644 --- a/contrib/pax_storage/src/test/regress/sql/shared_scan.sql +++ b/contrib/pax_storage/src/test/regress/sql/shared_scan.sql @@ -2,6 +2,9 @@ -- Queries that lead to hanging (not dead lock) when we don't handle synchronization properly in shared scan -- Queries that lead to wrong result when we don't finish executing the subtree below the shared scan being squelched. -- +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore CREATE SCHEMA shared_scan; diff --git a/contrib/pax_storage/src/test/regress/sql/stats_ext.sql b/contrib/pax_storage/src/test/regress/sql/stats_ext.sql index a6d1362da77..2fa5a327e3f 100644 --- a/contrib/pax_storage/src/test/regress/sql/stats_ext.sql +++ b/contrib/pax_storage/src/test/regress/sql/stats_ext.sql @@ -6,7 +6,7 @@ -- roughly the same estimates as in upstream. set gp_selectivity_damping_for_scans = off; set gp_selectivity_damping_for_joins = off; -set pax_enable_sparse_filter to off; +set pax.enable_sparse_filter to off; -- Set constraint_exclusion to partition as same as upstream to make test pass. set constraint_exclusion to 'partition'; @@ -1644,4 +1644,4 @@ DROP FUNCTION op_leak(int, int); RESET SESSION AUTHORIZATION; DROP SCHEMA tststats CASCADE; DROP USER regress_stats_user1; -reset pax_enable_sparse_filter; +reset pax.enable_sparse_filter; diff --git a/contrib/pax_storage/src/test/regress/sql/tidrangescan.sql b/contrib/pax_storage/src/test/regress/sql/tidrangescan.sql index 562ec68a792..0f7a99a040c 100644 --- a/contrib/pax_storage/src/test/regress/sql/tidrangescan.sql +++ b/contrib/pax_storage/src/test/regress/sql/tidrangescan.sql @@ -97,7 +97,7 @@ DECLARE c SCROLL CURSOR FOR SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; FETCH NEXT c; FETCH NEXT c; --start_ignore -/* backward scan is not supported in this version of Cloudberry Database */ +/* backward scan is not supported in this version of Apache Cloudberry */ /* FETCH PRIOR c; FETCH FIRST c; diff --git a/contrib/pax_storage/src/test/regress/sql/window.sql b/contrib/pax_storage/src/test/regress/sql/window.sql index 6fd72f478e4..3ef1bc824c6 100644 --- a/contrib/pax_storage/src/test/regress/sql/window.sql +++ b/contrib/pax_storage/src/test/regress/sql/window.sql @@ -2,6 +2,9 @@ -- wrong result for some core case. Turn it on to run the existing tests -- and minimize the difference from upstream. set enable_incremental_sort=on; +-- start_ignore +set gp_use_streaming_hashagg = off; +-- end_ignore -- -- WINDOW FUNCTIONS diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 7b41872634e..de77aac68b0 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -17,9 +17,24 @@ from pg_buffercache_summary(); t | t | t (1 row) +SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; + ?column? +---------- + t +(1 row) + -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. 
SET ROLE pg_database_owner; +SELECT * FROM pg_buffercache; +ERROR: permission denied for view pg_buffercache +SELECT * FROM pg_buffercache_pages() AS p (wrong int); +ERROR: permission denied for function pg_buffercache_pages +SELECT * FROM pg_buffercache_summary(); +ERROR: permission denied for function pg_buffercache_summary +SELECT * FROM pg_buffercache_usage_counts(); +ERROR: permission denied for function pg_buffercache_usage_counts +RESET role; SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; ?column? ---------- @@ -72,17 +87,7 @@ SELECT count(*) > 0 FROM gp_buffercache_usage_counts_aggregated WHERE buffers >= (1 row) -- Check that the functions / views can't be accessed by default. -CREATE ROLE buffercache_test; -SET ROLE buffercache_test; -SELECT * FROM pg_buffercache; -ERROR: permission denied for view pg_buffercache -SELECT * FROM pg_buffercache_pages() AS p (wrong int); -ERROR: permission denied for function pg_buffercache_pages -SELECT * FROM pg_buffercache_summary(); -ERROR: permission denied for function pg_buffercache_summary -RESET role; -SELECT * FROM pg_buffercache_usage_counts(); -ERROR: permission denied for function pg_buffercache_usage_counts +SET ROLE pg_database_owner; -- GPDB SELECT * FROM pg_buffercache_summary; ERROR: permission denied for view pg_buffercache_summary @@ -166,4 +171,3 @@ SELECT count(*) > 0 FROM gp_buffercache_usage_counts_aggregated; (1 row) RESET ROLE; -DROP ROLE buffercache_test; diff --git a/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql index 8f212dc5e93..d5aebf3ba39 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql @@ -12,6 +12,17 @@ CREATE FUNCTION pg_buffercache_summary( AS 'MODULE_PATHNAME', 'pg_buffercache_summary' LANGUAGE C PARALLEL SAFE; +CREATE FUNCTION pg_buffercache_usage_counts( + OUT usage_count int4, + OUT buffers int4, + OUT dirty int4, + OUT pinned int4) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_buffercache_usage_counts' +LANGUAGE C PARALLEL SAFE; + -- Don't want these to be available to public. 
REVOKE ALL ON FUNCTION pg_buffercache_summary() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_buffercache_summary() TO pg_monitor; +REVOKE ALL ON FUNCTION pg_buffercache_usage_counts() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_buffercache_usage_counts() TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 8e0807d6265..e91f35599f0 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -18,6 +18,7 @@ #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 #define NUM_BUFFERCACHE_PAGES_ELEM 9 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 +#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 PG_MODULE_MAGIC; @@ -61,6 +62,7 @@ typedef struct */ PG_FUNCTION_INFO_V1(pg_buffercache_pages); PG_FUNCTION_INFO_V1(pg_buffercache_summary); +PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); Datum pg_buffercache_pages(PG_FUNCTION_ARGS) @@ -304,3 +306,44 @@ pg_buffercache_summary(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +Datum +pg_buffercache_usage_counts(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int usage_counts[BM_MAX_USAGE_COUNT + 1] = {0}; + int dirty[BM_MAX_USAGE_COUNT + 1] = {0}; + int pinned[BM_MAX_USAGE_COUNT + 1] = {0}; + Datum values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM]; + bool nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0}; + + InitMaterializedSRF(fcinfo, 0); + + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state = pg_atomic_read_u32(&bufHdr->state); + int usage_count; + + usage_count = BUF_STATE_GET_USAGECOUNT(buf_state); + usage_counts[usage_count]++; + + if (buf_state & BM_DIRTY) + dirty[usage_count]++; + + if (BUF_STATE_GET_REFCOUNT(buf_state) > 0) + pinned[usage_count]++; + } + + for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++) + { + values[0] = Int32GetDatum(i); + values[1] = Int32GetDatum(usage_counts[i]); + values[2] = Int32GetDatum(dirty[i]); + values[3] = Int32GetDatum(pinned[i]); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +} diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 5d2beecd707..e9091c9e38c 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -10,12 +10,15 @@ select buffers_used + buffers_unused > 0, buffers_pinned <= buffers_used from pg_buffercache_summary(); +SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; + -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; SELECT * FROM pg_buffercache_pages() AS p (wrong int); SELECT * FROM pg_buffercache_summary(); +SELECT * FROM pg_buffercache_usage_counts(); RESET role; SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; @@ -42,12 +45,7 @@ SELECT count(*) > 0 FROM gp_buffercache_usage_counts WHERE buffers >= 0; SELECT count(*) > 0 FROM gp_buffercache_usage_counts_aggregated WHERE buffers >= 0; -- Check that the functions / views can't be accessed by default. 
-CREATE ROLE buffercache_test; -SET ROLE buffercache_test; -SELECT * FROM pg_buffercache; -SELECT * FROM pg_buffercache_pages() AS p (wrong int); -SELECT * FROM pg_buffercache_summary(); -SELECT * FROM pg_buffercache_usage_counts(); +SET ROLE pg_database_owner; -- GPDB SELECT * FROM pg_buffercache_summary; SELECT * FROM pg_buffercache_usage_counts; @@ -73,5 +71,3 @@ SELECT buffers_used + buffers_unused > 0 FROM gp_buffercache_summary_aggregated; SELECT count(*) > 0 FROM gp_buffercache_usage_counts; SELECT count(*) > 0 FROM gp_buffercache_usage_counts_aggregated; RESET ROLE; - -DROP ROLE buffercache_test; diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 3cd6221d001..b125bcca694 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -313,7 +313,8 @@ static void pgss_post_parse_analyze(ParseState *pstate, Query *query, static PlannedStmt *pgss_planner(Query *parse, const char *query_string, int cursorOptions, - ParamListInfo boundParams); + ParamListInfo boundParams, + OptimizerOptions *optimizer_options); static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags); static void pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, @@ -866,7 +867,8 @@ static PlannedStmt * pgss_planner(Query *parse, const char *query_string, int cursorOptions, - ParamListInfo boundParams) + ParamListInfo boundParams, + OptimizerOptions *optimizer_options) { PlannedStmt *result; @@ -908,10 +910,10 @@ pgss_planner(Query *parse, { if (prev_planner_hook) result = prev_planner_hook(parse, query_string, cursorOptions, - boundParams); + boundParams, optimizer_options); else result = standard_planner(parse, query_string, cursorOptions, - boundParams); + boundParams, optimizer_options); } PG_FINALLY(); { @@ -945,10 +947,10 @@ pgss_planner(Query *parse, { if (prev_planner_hook) result = prev_planner_hook(parse, query_string, cursorOptions, - boundParams); + boundParams, optimizer_options); else result = standard_planner(parse, query_string, cursorOptions, - boundParams); + boundParams, optimizer_options); } return result; diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 4542d692b55..c8542e75416 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -152,6 +152,8 @@ select pgstatginindex('test_hashidx'); ERROR: relation "test_hashidx" is not a GIN index -- check that using any of these functions with unsupported relations will fail create table test_partitioned (a int) partition by range (a); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
create index test_partitioned_index on test_partitioned(a); -- these should all fail select pgstattuple('test_partitioned'); @@ -159,9 +161,9 @@ ERROR: "test_partitioned" (partitioned table) is not supported select pgstattuple('test_partitioned_index'); ERROR: "test_partitioned_index" (partitioned index) is not supported select pgstattuple_approx('test_partitioned'); -ERROR: "test_partitioned" is not a table, materialized view, or TOAST table +ERROR: "test_partitioned" is not a table, directory table, materialized view, or TOAST table select pg_relpages('test_partitioned'); -ERROR: "test_partitioned" is not a table, index, materialized view, sequence, or TOAST table +ERROR: "test_partitioned" is not a table, directory table, index, materialized view, sequence, or TOAST table select pgstatindex('test_partitioned'); ERROR: relation "test_partitioned" is not a btree index select pgstatginindex('test_partitioned'); @@ -173,9 +175,9 @@ create view test_view as select 1; select pgstattuple('test_view'); ERROR: "test_view" (view) is not supported select pgstattuple_approx('test_view'); -ERROR: "test_view" is not a table, materialized view, or TOAST table +ERROR: "test_view" is not a table, directory table, materialized view, or TOAST table select pg_relpages('test_view'); -ERROR: "test_view" is not a table, index, materialized view, sequence, or TOAST table +ERROR: "test_view" is not a table, directory table, index, materialized view, sequence, or TOAST table select pgstatindex('test_view'); ERROR: relation "test_view" is not a btree index select pgstatginindex('test_view'); @@ -189,9 +191,9 @@ create foreign table test_foreign_table () server dummy_server; select pgstattuple('test_foreign_table'); ERROR: "test_foreign_table" (foreign table) is not supported select pgstattuple_approx('test_foreign_table'); -ERROR: "test_foreign_table" is not a table, materialized view, or TOAST table +ERROR: "test_foreign_table" is not a table, directory table, materialized view, or TOAST table select pg_relpages('test_foreign_table'); -ERROR: "test_foreign_table" is not a table, index, materialized view, sequence, or TOAST table +ERROR: "test_foreign_table" is not a table, directory table, index, materialized view, sequence, or TOAST table select pgstatindex('test_foreign_table'); ERROR: relation "test_foreign_table" is not a btree index select pgstatginindex('test_foreign_table'); @@ -200,6 +202,7 @@ select pgstathashindex('test_foreign_table'); ERROR: "test_foreign_table" is not an index -- a partition of a partitioned table should work though create table test_partition partition of test_partitioned for values from (1) to (100); +NOTICE: table has parent, setting distribution columns to match parent table select pgstattuple('test_partition'); pgstattuple --------------------- diff --git a/contrib/udp2/CMakeLists.txt b/contrib/udp2/CMakeLists.txt new file mode 100644 index 00000000000..9a4835a1b53 --- /dev/null +++ b/contrib/udp2/CMakeLists.txt @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.11.0) +project(udp2) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Get the top-level project directory +set(TOP_DIR ${PROJECT_SOURCE_DIR}/../..) +set(CBDB_INCLUDE_DIR ${TOP_DIR}/src/include) + +# CMAKE_INSTALL_PREFIX should be set by the calling Makefile +# If not set, we'll use a reasonable default but warn about it +if(NOT DEFINED CMAKE_INSTALL_PREFIX OR CMAKE_INSTALL_PREFIX STREQUAL "/usr/local") + message(WARNING "CMAKE_INSTALL_PREFIX not set by parent build system, using default") + set(CMAKE_INSTALL_PREFIX "/usr/local" CACHE PATH "Install prefix" FORCE) +endif() + +# Check for debug/release configuration from main project +include(CheckSymbolExists) +set(PG_CONFIG_HEADER_FILE "${CBDB_INCLUDE_DIR}/pg_config.h") +if(EXISTS "${PG_CONFIG_HEADER_FILE}") + CHECK_SYMBOL_EXISTS(USE_ASSERT_CHECKING "${PG_CONFIG_HEADER_FILE}" UDP2_USE_DEBUG) + if(UDP2_USE_DEBUG) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Choose the type of build." FORCE) + message(STATUS "Setting CMAKE_BUILD_TYPE to 'Debug' based on main project configuration") + else() + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) + message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' based on main project configuration") + endif() +else() + # Fallback to Release if pg_config.h is not found + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) + message(STATUS "Setting default CMAKE_BUILD_TYPE to 'Release'") + endif() +endif() + +# First, build and install ic_common as a subdirectory +add_subdirectory(ic_common) + +# Set up include directories +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_INSTALL_PREFIX}/include/postgresql/) +include_directories(${CMAKE_INSTALL_PREFIX}/include/postgresql/udp2/) +include_directories(${CBDB_INCLUDE_DIR}) +include_directories(${CBDB_INCLUDE_DIR}/server) + +# Set up library directories +link_directories(${CMAKE_INSTALL_PREFIX}/lib/postgresql/) + +# Source files for udp2 module +set(UDP2_SOURCES + ic_udp2.c + ic_modules.c +) + +# Create the udp2 shared library +add_library(udp2 SHARED ${UDP2_SOURCES}) + +# Set compiler flags consistent with main project +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GNU_SOURCE -Wall -Wpointer-arith -Wendif-labels -Wmissing-format-attribute -Wimplicit-fallthrough=3 -Wcast-function-type -Wformat-security -fno-strict-aliasing -fwrapv") + +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -ggdb") + message(STATUS "Building udp2 in Debug mode") +elseif (${CMAKE_BUILD_TYPE} STREQUAL "Release") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS} -O2 -DNDEBUG") + message(STATUS "Building udp2 in Release mode") +endif() + +# Link against ic_common library +target_link_libraries(udp2 ic_common) + +# Make sure ic_common is built before udp2 +add_dependencies(udp2 ic_common) + +# Set output name and remove lib prefix +set_target_properties(udp2 PROPERTIES + OUTPUT_NAME "udp2" + PREFIX "" +) + +# Install the udp2 library +install(TARGETS udp2 + LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/postgresql/" + ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/postgresql/" +) + +# Install udp2 headers +install(FILES + "${CMAKE_CURRENT_SOURCE_DIR}/ic_udp2.h" + "${CMAKE_CURRENT_SOURCE_DIR}/ic_modules.h" + DESTINATION "${CMAKE_INSTALL_PREFIX}/include/postgresql/" +) diff --git a/contrib/udp2/Makefile b/contrib/udp2/Makefile new file mode 100644 index 00000000000..e4891e9a528 --- /dev/null +++ b/contrib/udp2/Makefile @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# contrib/udp2/Makefile + +top_builddir = ../.. + +# Include the standard PostgreSQL build system to get variables like prefix, DESTDIR, etc. +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/udp2 +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# Use CMake for building this module +CMAKE_BUILD_DIR = build + +# Default target +all: $(CMAKE_BUILD_DIR)/udp2.so + +# Create build directory and configure with CMake +$(CMAKE_BUILD_DIR)/Makefile: + @echo "Configuring udp2 with CMake..." 
+ @mkdir -p $(CMAKE_BUILD_DIR) + @cd $(CMAKE_BUILD_DIR) && cmake -DCMAKE_INSTALL_PREFIX=$(DESTDIR)$(prefix) .. + +# Build the project using CMake +$(CMAKE_BUILD_DIR)/udp2.so: $(CMAKE_BUILD_DIR)/Makefile + @echo "Building udp2 with CMake..." + @cd $(CMAKE_BUILD_DIR) && $(MAKE) + @cp $(CMAKE_BUILD_DIR)/udp2.so udp2.so + +# Install target +install: $(CMAKE_BUILD_DIR)/udp2.so + @echo "Installing udp2..." + @cd $(CMAKE_BUILD_DIR) && $(MAKE) install + +# Clean target +clean: + @echo "Cleaning udp2..." + @rm -rf $(CMAKE_BUILD_DIR) + @rm -f udp2.so + +# Ensure ic_common is built first +ic_common: + @echo "Building ic_common..." + @mkdir -p ic_common/build + @cd ic_common/build && cmake -DCMAKE_INSTALL_PREFIX=$(DESTDIR)$(prefix) .. && $(MAKE) && $(MAKE) install + +# Make sure ic_common is built before udp2 +$(CMAKE_BUILD_DIR)/udp2.so: ic_common + +.PHONY: all install clean ic_common diff --git a/contrib/udp2/README.md b/contrib/udp2/README.md new file mode 100644 index 00000000000..9cfe9f9c989 --- /dev/null +++ b/contrib/udp2/README.md @@ -0,0 +1,247 @@ + + +# UDP2 Interconnect Protocol Implementation + +## Project Background + +UDP2 is a next-generation interconnect protocol implementation based on the original UDP protocol, located in the `contrib/udp2` directory. In CloudBerry Database, the interconnect is responsible for data transmission and synchronization between nodes, serving as a core component for distributed query execution. + +Currently, the database supports three interconnect protocol implementations: +- **TCP** (`contrib/interconnect/tcp`) - Reliable transmission based on TCP protocol +- **UDP** (`contrib/interconnect/udp`) - High-performance transmission based on UDP protocol +- **Proxy** (`contrib/interconnect/proxy`) - Proxy-based transmission + +UDP2 is an architectural refactoring based on the original UDP protocol implementation, aimed at achieving complete separation between interconnect and the database kernel. + +## Project Goals + +The core objectives of the UDP2 protocol implementation are: + +1. **Architecture Decoupling**: Completely separate the interconnect protocol implementation from the database kernel, enabling independent development and evolution +2. **Independent Testing**: Enable end-to-end functional and performance testing of interconnect without depending on the database kernel +3. **Rapid Diagnosis**: Quickly identify whether issues are at the interconnect level or database kernel level +4. 
**Modular Design**: Provide clear interface boundaries for easier extension and maintenance + +## Current Project Implementation Architecture + +### Overall Architecture Design + +UDP2 adopts a layered architecture design, primarily divided into two layers: + +``` +┌─────────────────────────────────────────────────────────┐ +│ Database Kernel Layer │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ contrib/udp2/ │ │ +│ │ ┌─────────────────┐ ┌────────────────────────┐ │ │ +│ │ │ ic_modules.c │ │ ic_udp2.c │ │ │ +│ │ │ ic_modules.h │ │ ic_udp2.h │ │ │ +│ │ └─────────────────┘ └────────────────────────┘ │ │ +│ │ Adapter Layer (Database Adapter) │ │ +│ └────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ + │ + │ C/C++ Interface + ▼ +┌──────────────────────────────────────────────────────────┐ +│ Independent IC Communication Library │ +│ ┌─────────────────────────────────────────────────────┐ │ +│ │ contrib/udp2/ic_common/ │ │ +│ │ ┌─────────────────┐ ┌─────────────────────────┐ │ │ +│ │ │ ic_types.h │ │ ic_utility.hpp │ │ │ +│ │ │ ic_except.hpp │ │ ic_faultinjection.h │ │ │ +│ │ └─────────────────┘ └─────────────────────────┘ │ │ +│ │ ┌────────────────────────────────────────────────┐ │ │ +│ │ │ contrib/udp2/ic_common/udp2/ │ │ │ +│ │ │ ┌─────────────────┐ ┌─────────────────────┐ │ │ │ +│ │ │ │ ic_udp2.h │ │ ic_udp2.hpp │ │ │ │ +│ │ │ │ ic_udp2.cpp │ │ic_udp2_internal.hpp │ │ │ │ +│ │ │ └─────────────────┘ └─────────────────────┘ │ │ │ +│ │ └────────────────────────────────────────────────┘ │ │ +│ │ Core Communication Library │ │ +│ └─────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────┘ +``` + +### Core Component Description + +#### 1. Adapter Layer (`contrib/udp2/`) +- **ic_modules.c/h**: Module registration and initialization, implementing the `MotionIPCLayer` interface +- **ic_udp2.c/h**: Adapter layer between database kernel and ic_common library +- Responsible for converting database kernel data structures to ic_common library standard interfaces + +#### 2. Core Communication Library (`contrib/udp2/ic_common/`) +- **ic_types.h**: Defines core data types and interfaces, decoupled from database kernel +- **ic_utility.hpp**: Common utility functions and logging system +- **ic_except.hpp**: Exception handling mechanism +- **ic_faultinjection.h**: Fault injection testing support + +#### 3. UDP2 Protocol Implementation (`contrib/udp2/ic_common/udp2/`) +- **ic_udp2.h**: C language interface definition +- **ic_udp2.hpp**: C++ interface definition +- **ic_udp2.cpp**: Core protocol implementation +- **ic_udp2_internal.hpp**: Internal implementation details + +### Build System + +UDP2 uses CMake build system with support for independent compilation: + +``` +contrib/udp2/ +├── CMakeLists.txt # Main build configuration +├── Makefile # PostgreSQL-compatible Makefile +└── ic_common/ + ├── CMakeLists.txt # ic_common library build configuration + └── build/ # Build output directory +``` + +Build process: +1. First build the `ic_common` dynamic library (`libic_common.so`) +2. Then build the `udp2` module (`udp2.so`), linking against the `ic_common` library + +## How to Switch Database to This Protocol Implementation + +### Enable UDP2 Support at Compile Time + +1. **Configure compilation options**: +```bash +./configure --enable-ic-udp2 [other options] +make && make install +``` + +2. 
**Verify compilation results**: +```bash +# Check if udp2.so is generated +ls -la $GPHOME/lib/postgresql/udp2.so + +# Check if ic_common library is installed +ls -la $GPHOME/lib/libic_common.so +``` + +### Runtime Configuration + +```bash +# Set cluster to use UDP2 by default +gpconfig -c gp_interconnect_type -v udp2 + +# Reload configuration +gpstop -air +``` + +```sql +-- Check current interconnect type +SHOW gp_interconnect_type; +``` + +## Technical Details + +### Interface Design + +UDP2 achieves decoupling between database kernel and communication library through standardized C interfaces: + +```c +// Core interface functions (ic_common/udp2/ic_udp2.h) +extern ICChunkTransportState* UDP2_SetupUDP(ICSliceTable *sliceTable, + SessionMotionLayerIPCParam *param); +extern void UDP2_TeardownUDP(ICChunkTransportState *transportStates, bool hasErrors); + +// Data send/receive interfaces +extern bool UDP2_SendData(ICChunkTransportState *transportStates, + int16 motNodeID, int16 targetRoute, + DataBlock *pblocks, int num, bool broadcast); +extern void UDP2_RecvAny(ICChunkTransportState *transportStates, + int16 motNodeID, int16 *srcRoute, + GetDataLenInPacket getLen, DataBlock *data); +``` + +### Data Structure Mapping + +UDP2 defines lightweight data structures to replace complex database kernel structures: + +```c +// Lightweight process information (replaces CdbProcess) +typedef struct ICCdbProcess { + bool valid; + char *listenerAddr; + int listenerPort; + int pid; + int contentid; + int dbid; +} ICCdbProcess; + +// Lightweight slice information (replaces ExecSlice) +typedef struct ICExecSlice { + int sliceIndex; + int parentIndex; + int numChildren; + int *children; + int numSegments; + int numPrimaryProcesses; + ICCdbProcess *primaryProcesses; +} ICExecSlice; +``` + +### Error Handling Mechanism + +UDP2 implements a unified error handling mechanism: + +```c +typedef enum ErrorLevel { + LEVEL_OK, + LEVEL_ERROR, + LEVEL_FATAL, +} ErrorLevel; + +// Error handling interfaces +extern void SetLastError(ErrorLevel level, const char *msg); +extern ICError* GetLastError(); +extern void ResetLastError(); +``` + +## Development and Debugging + +### Independent Compilation Testing + +```bash +# Enter ic_common directory +cd contrib/udp2/ic_common + +# Create build directory +mkdir build && cd build + +# Configure and compile +cmake -DCMAKE_BUILD_TYPE=Debug .. +make -j +``` + +### Debug Configuration + +Enable verbose logging in development environment: + +```sql +-- Enable interconnect debug logging +SET gp_log_interconnect = 'debug'; + +-- Set log level +SET log_min_messages = 'debug1'; + +-- Enable detailed error information +SET gp_interconnect_log_stats = on; +``` diff --git a/contrib/udp2/ic_common/CMakeLists.txt b/contrib/udp2/ic_common/CMakeLists.txt new file mode 100644 index 00000000000..2c26e002b84 --- /dev/null +++ b/contrib/udp2/ic_common/CMakeLists.txt @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.11.0) +project(ic_common) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Get the top-level project directory +set(TOP_DIR ${PROJECT_SOURCE_DIR}/../../..) +set(CBDB_INCLUDE_DIR ${TOP_DIR}/src/include) + +# CMAKE_INSTALL_PREFIX should be set by the calling Makefile +# If not set, we'll use a reasonable default but warn about it +if(NOT DEFINED CMAKE_INSTALL_PREFIX OR CMAKE_INSTALL_PREFIX STREQUAL "/usr/local") + message(WARNING "CMAKE_INSTALL_PREFIX not set by parent build system, using default") + set(CMAKE_INSTALL_PREFIX "/usr/local" CACHE PATH "Install prefix" FORCE) +endif() + +# Check for debug/release configuration from main project +include(CheckSymbolExists) +set(PG_CONFIG_HEADER_FILE "${CBDB_INCLUDE_DIR}/pg_config.h") +if(EXISTS "${PG_CONFIG_HEADER_FILE}") + CHECK_SYMBOL_EXISTS(USE_ASSERT_CHECKING "${PG_CONFIG_HEADER_FILE}" IC_USE_DEBUG) + if(IC_USE_DEBUG) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Choose the type of build." FORCE) + message(STATUS "Setting CMAKE_BUILD_TYPE to 'Debug' based on main project configuration") + else() + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) + message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' based on main project configuration") + endif() +else() + # Fallback to Release if pg_config.h is not found + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) + message(STATUS "Setting default CMAKE_BUILD_TYPE to 'Release'") + endif() +endif() + +file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/udp2/*.cpp") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/udp2) + +add_library(ic_common SHARED ${SOURCES}) + +set_target_properties(ic_common PROPERTIES OUTPUT_NAME "ic_common") + +# Set compiler flags consistent with main project +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GNU_SOURCE -Wall -Wpointer-arith -Wendif-labels -Wmissing-format-attribute -Wimplicit-fallthrough=3 -Wcast-function-type -Wformat-security -fno-strict-aliasing -fwrapv") + +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -ggdb") + message(STATUS "Building ic_common in Debug mode") +elseif (${CMAKE_BUILD_TYPE} STREQUAL "Release") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O2 -DNDEBUG") + message(STATUS "Building ic_common in Release mode") +endif() + +# Install headers to the main project's include directory +install(FILES + "${CMAKE_CURRENT_SOURCE_DIR}/ic_types.h" + "${CMAKE_CURRENT_SOURCE_DIR}/ic_except.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/ic_utility.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/ic_faultinjection.h" + DESTINATION "${CMAKE_INSTALL_PREFIX}/include/postgresql/" +) + +install(FILES + "${CMAKE_CURRENT_SOURCE_DIR}/udp2/ic_udp2.h" + "${CMAKE_CURRENT_SOURCE_DIR}/udp2/ic_udp2.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/udp2/ic_udp2_internal.hpp" + DESTINATION "${CMAKE_INSTALL_PREFIX}/include/postgresql/udp2/" +) + +# Install library to the main project's lib directory +install(TARGETS ic_common + LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/" + ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/" +) diff --git a/contrib/udp2/ic_common/ic_except.hpp b/contrib/udp2/ic_common/ic_except.hpp new file mode 100644 index 00000000000..9761d499487 --- /dev/null +++ b/contrib/udp2/ic_common/ic_except.hpp @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * ic_except.hpp + * + * IDENTIFICATION + * contrib/udp2/ic_common/ic_except.hpp + * + *------------------------------------------------------------------------- + */ +#ifndef IC_EXCEPT_HPP +#define IC_EXCEPT_HPP + +class ICException: public std::runtime_error { +public: + ICException(const std::string & arg, const char *file, int line): std::runtime_error(arg) { + std::stringstream ss; + ss << arg << " from " << file << ":" << line << std::endl; + detail = ss.str(); + } + ~ICException() throw() {} + + virtual const char *msg() const { + return detail.c_str(); + } + +protected: + std::string detail; +}; + +class ICFatalException: public ICException { +public: + ICFatalException(const std::string & arg, const char *file, int line): + ICException(arg, file, line) {} + ~ICFatalException() throw () {} +}; + +class ICInvalidIndex: public ICException { +public: + ICInvalidIndex(const std::string & arg, const char *file, int line): + ICException(arg, file, line) {} + ~ICInvalidIndex() throw () {} +}; + +class ICNetworkException: public ICException { +public: + ICNetworkException(const std::string & arg, const char *file, int line): + ICException(arg, file, line) {} + ~ICNetworkException() throw () {} +}; + +class ICReceiveThreadException: public ICException { +public: + ICReceiveThreadException(const std::string & arg, const char *file, int line): + ICException(arg, file, line) {} + ~ICReceiveThreadException() throw () {} +}; + +#endif // IC_EXCEPT_HPP \ No newline at end of file diff --git a/contrib/udp2/ic_common/ic_faultinjection.h b/contrib/udp2/ic_common/ic_faultinjection.h new file mode 100644 index 00000000000..b79cfed28dd --- /dev/null +++ b/contrib/udp2/ic_common/ic_faultinjection.h @@ -0,0 +1,620 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * ic_faultinjection.h + * + * IDENTIFICATION + * contrib/udp2/ic_common/ic_faultinjection.h + * + *------------------------------------------------------------------------- + */ + +#ifndef IC_FAULTINJECTION_H +#define IC_FAULTINJECTION_H + +#ifdef HAVE_POLL_H +#include +#endif + +#ifdef HAVE_SYS_POLL_H +#include +#endif + +#ifdef USE_ASSERT_CHECKING + +static bool udp_testmode = false; + +static inline void +set_test_mode() +{ + if (session_param.gp_udpic_dropseg != UNDEF_SEGMENT + || session_param.gp_udpic_dropacks_percent != 0 + || session_param.gp_udpic_dropxmit_percent != 0 + || session_param.gp_udpic_fault_inject_percent != 0) + udp_testmode = true; + else + udp_testmode = false; +} + +/* + * testmode_inject_fault + * Return whether we inject a fault given a probability. 
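+ *		For example, with gp_udpic_fault_inject_percent = 20, roughly one in
+ *		five calls returns true, and only when test mode is on and the current
+ *		segment matches gp_udpic_dropseg (UNDEF_SEGMENT means any segment).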
+ * + */ +static inline bool +testmode_inject_fault(int percent) +{ + if (udp_testmode && + (session_param.gp_udpic_dropseg == UNDEF_SEGMENT || session_param.gp_udpic_dropseg == global_param.segindex)) + { + if (random() % 100 < percent) + return true; + } + + return false; +} + +/* Track the malloc times */ +static pthread_mutex_t icudp_malloc_tracking_lock = PTHREAD_MUTEX_INITIALIZER; +static int64 icudp_malloc_times = 0; + +/* Fault type enumeration. */ +typedef enum { + /* These are used to inject packet content corruption. */ + FINC_PKT_HEADER_SHORTEN = 0, + FINC_PKT_PKT_SHORTEN = 1, + FINC_PKT_CRC_CORRUPT = 2, + FINC_PKT_HEADER_LEN_ZERO = 3, + FINC_PKT_HEADER_LEN_NEGATIVE = 4, + FINC_PKT_MISMATCH = 5, + + /* These are used to inject query cancel and process die. */ + FINC_INTR_QUERY_CANCEL = 12, + FINC_INTR_PROC_DIE = 13, + + /* These are used to inject OS API errors. */ + FINC_OS_EAGAIN = 16, + FINC_OS_EINTR = 17, + FINC_OS_EWOULDBLOCK = 18, + FINC_OS_NET_INTERFACE = 19, + FINC_OS_MEM_INTERFACE = 20, + FINC_OS_CREATE_THREAD = 21, + FINC_PKT_TOO_LONG = 22, + + /* These are used to inject network faults. */ + FINC_NET_RECV_ERROR = 23, + FINC_NET_PKT_DUP = 24, + FINC_NET_RECV_ZERO = 25, + + /* This is a fault which is used to introduce a specific null return of malloc in bg thread */ + FINC_RX_BUF_NULL = 29, + + /* The last guard item, don't put anything behind this one. */ + FINC_MAX_LIMITATION = 31, +} FAULT_INJECTION_TYPE; + +#define FINC_HAS_FAULT(type) (session_param.gp_udpic_fault_inject_bitmap & (1U << (type))) + +/* +* FAULT_TYPE_FUNCTION +* define the all test function name which uses fault injection test. +*/ +typedef enum { + FUNC_TESTMODE_RECVFROM, + FUNC_TESTMODE_MAX +} FAULT_TYPE_FUNCTION; + +/* +* fault_type_array_length +* record acutal fault type number for every test function which uses fault injection test. +*/ +static unsigned int fault_type_array_length[FUNC_TESTMODE_MAX][1] = { 0 }; + +/* +* fault_type_array +* define the fault type for for every test function which uses fault injection test. +*/ +static const unsigned int fault_type_array[][FINC_MAX_LIMITATION + 1] = { + { FINC_OS_EAGAIN, FINC_OS_EINTR, FINC_OS_EWOULDBLOCK, FINC_NET_RECV_ZERO, FINC_OS_NET_INTERFACE, FINC_NET_RECV_ERROR, FINC_MAX_LIMITATION } +}; + +/* + * testmode_check_interrupts + * ML_CHECK_FOR_INTERRUPTS in test mode with interrupts injected. + */ +static inline void +testmode_check_interrupts(const char *caller_name, bool teardownActive) +{ +} + +/* We needs a private copy to corrupt the packet. */ +#define FAULT_INJECT_BACKUP_PKT() \ +do { \ + pktModified = true; \ + memcpy(&hdrbk, (void *) buffer, sizeof(icpkthdr)); \ +} while (0) + +/* + * get_fault_type_array_length + * Retrive test function fault type size recorded in array fault_type_array_length. + */ +static inline unsigned int +get_fault_type_array_length(unsigned int func_name) +{ + if (!fault_type_array_length[func_name][0]) { + for (unsigned int i = 0; i < FINC_MAX_LIMITATION + 1; i++) { + if (fault_type_array[func_name][i] == FINC_MAX_LIMITATION) { + fault_type_array_length[func_name][0] = i; + break; + } + } + } + return fault_type_array_length[func_name][0]; +} + +/* + * random_with_range + * Generate the random value predefined in fault_type_array. 
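+ *		Picks, uniformly at random, one of the fault types registered in
+ *		fault_type_array for the given test function
+ *		(e.g. FUNC_TESTMODE_RECVFROM).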
+ */ +static inline int +random_with_array(unsigned int func_name) +{ + Assert((func_name < FUNC_TESTMODE_MAX) && (fault_type_array!= NULL)); + unsigned int length = get_fault_type_array_length(func_name); + Assert((length != 0) && (length < FINC_MAX_LIMITATION)); + return fault_type_array[func_name][(rand() % length)]; +} + +/* + * testmode_sendto + * Many kinds of fault packets are injected in this function. + */ +static ssize_t +testmode_sendto(const char *caller_name, int socket, const void *buffer, + size_t length, int flags, const struct sockaddr *dest_addr, + socklen_t dest_len) +{ + int n; + int testmode_length = length; + size_t icpkthdr_size = sizeof(icpkthdr); + bool is_pkt = false; + bool pktModified = false; + int fault_type; + icpkthdr hdrbk; + icpkthdr *msg = (icpkthdr *) buffer; + + if (!testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + goto no_fault_inject; + + /* + * Generate a fault type. + */ + fault_type = random() % FINC_MAX_LIMITATION; + + /* Make sure we are modifying a packet. */ + if (length >= icpkthdr_size) + is_pkt = true; + + /* Inject a fault */ + switch (fault_type) + { + case FINC_PKT_HEADER_SHORTEN: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + testmode_length = icpkthdr_size - 1; + LOG(INFO, "inject fault to sendto: FINC_PKT_HEADER_SHORTEN"); + break; + + case FINC_PKT_PKT_SHORTEN: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + if (length > icpkthdr_size) + testmode_length--; + LOG(INFO, "inject fault to sendto: FINC_PKT_PKT_SHORTEN"); + break; + + case FINC_PKT_CRC_CORRUPT: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + FAULT_INJECT_BACKUP_PKT(); + if (!session_param.gp_interconnect_full_crc) + break; + msg->crc++; + LOG(INFO, "inject fault to sendto: FINC_PKT_CRC_CORRUPT"); + break; + + case FINC_PKT_HEADER_LEN_ZERO: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + FAULT_INJECT_BACKUP_PKT(); + msg->len = 0; + LOG(INFO, "inject fault to sendto: FINC_PKT_HEADER_LEN_ZERO"); + break; + + case FINC_PKT_HEADER_LEN_NEGATIVE: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + FAULT_INJECT_BACKUP_PKT(); + msg->len = -1; + LOG(INFO, "inject fault to sendto: FINC_PKT_HEADER_LEN_NEGATIVE"); + break; + + case FINC_PKT_MISMATCH: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + FAULT_INJECT_BACKUP_PKT(); + msg->srcPid = -1; /* There is no such pid. 
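+									 * icId and seq are also reset below to make
+									 * the header mismatch what the receiver
+									 * expects.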
*/ + msg->icId = 0; + msg->seq = 1; + LOG(INFO, "inject fault to sendto: FINC_PKT_MISMATCH"); + break; + + case FINC_OS_EAGAIN: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to sendto: FINC_OS_EAGAIN"); + errno = EAGAIN; + return -1; + + case FINC_OS_EINTR: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to sendto: FINC_OS_EINTR"); + errno = EINTR; + return -1; + + case FINC_NET_PKT_DUP: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to sendto: FINC_NET_PKT_DUP"); + if ((n = sendto(socket, buffer, testmode_length, flags, dest_addr, dest_len)) != (int)length) + return n; + break; + + case FINC_OS_NET_INTERFACE: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to sendto: FINC_OS_NET_INTERFACE"); + errno = EFAULT; + return -1; + + case FINC_PKT_TOO_LONG: + if (!FINC_HAS_FAULT(fault_type) || !is_pkt) + break; + LOG(INFO, "inject fault to sendto: FINC_PKT_TOO_LONG"); + errno = EMSGSIZE; + return -1; + + default: + break; + } + +no_fault_inject: + n = sendto(socket, buffer, testmode_length, flags, dest_addr, dest_len); + + if (pktModified) + memcpy((void *) buffer, &hdrbk, sizeof(icpkthdr)); + return n; +} + +/* + * testmode_recvfrom + * recvfrom function with faults injected. + */ +static ssize_t +testmode_recvfrom(const char *caller_name, int socket, void *buffer, + size_t length, int flags, struct sockaddr *address, + socklen_t *address_len) +{ + int fault_type; + + if (!testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + goto no_fault_inject; + + fault_type = random_with_array(FUNC_TESTMODE_RECVFROM); + + switch (fault_type) + { + case FINC_OS_EAGAIN: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to recvfrom: FINC_OS_EAGAIN"); + errno = EAGAIN; + return -1; + + case FINC_OS_EINTR: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to recvfrom: FINC_OS_EINTR"); + errno = EINTR; + return -1; + + case FINC_OS_EWOULDBLOCK: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to recvfrom: FINC_OS_EWOULDBLOCK"); + errno = EWOULDBLOCK; + return -1; + + case FINC_NET_RECV_ZERO: + if (!FINC_HAS_FAULT(fault_type)) + break; + + memset(buffer, 0, length); + LOG(INFO, "inject fault to recvfrom: FINC_NET_RECV_ZERO"); + return 0; + + case FINC_OS_NET_INTERFACE: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to recvfrom: FINC_OS_NET_INTERFACE"); + errno = EFAULT; + return -1; + + case FINC_NET_RECV_ERROR: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to recvfrom: FINC_NET_RECV_ERROR"); + errno = EFAULT; + return -1; + + default: + break; + } + +no_fault_inject: + return recvfrom(socket, buffer, length, flags, address, address_len); +} + +/* + * testmode_poll + * poll function with faults injected. 
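+ *		Only FINC_OS_EINTR and FINC_OS_NET_INTERFACE are simulated here; any
+ *		other generated fault type falls through to the real poll().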
+ */ +static int +testmode_poll(const char *caller_name, struct pollfd fds[], nfds_t nfds, + int timeout) +{ + int fault_type; + + if (!testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + goto no_fault_inject; + + fault_type = random() % FINC_MAX_LIMITATION; + + switch (fault_type) + { + case FINC_OS_EINTR: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to poll: FINC_OS_EINTR"); + errno = EINTR; + return -1; + + case FINC_OS_NET_INTERFACE: + if (!FINC_HAS_FAULT(fault_type)) + break; + LOG(INFO, "inject fault to poll: FINC_OS_NET_INTERFACE"); + errno = EFAULT; + return -1; + + default: + break; + } + +no_fault_inject: + return poll(fds, nfds, timeout); +} + +/* + * testmode_socket + * socket function with faults injected. + * + */ +static int +testmode_socket(const char *caller_name, int domain, int type, int protocol) +{ + if (FINC_HAS_FAULT(FINC_OS_NET_INTERFACE) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + { + LOG(INFO, "inject fault to socket: FINC_OS_NET_INTERFACE"); + errno = ENOMEM; + return -1; + } + + return socket(domain, type, protocol); +} + +/* + * testmode_bind + * bind function with fault injected. + * + */ +static int +testmode_bind(const char *caller_name, int socket, + const struct sockaddr *address, socklen_t address_len) +{ + if (FINC_HAS_FAULT(FINC_OS_NET_INTERFACE) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + { + LOG(INFO, "inject fault to bind: FINC_OS_NET_INTERFACE"); + errno = EFAULT; + return -1; + } + + return bind(socket, address, address_len); +} + +/* + * testmode_getsockname + * getsockname function with faults injected. + * + */ +static int +testmode_getsockname(const char *caller_name, int socket, + struct sockaddr *address, + socklen_t *address_len) +{ + if (FINC_HAS_FAULT(FINC_OS_NET_INTERFACE) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + { + LOG(INFO, "inject fault to getsockname: FINC_OS_NET_INTERFACE"); + errno = EFAULT; + return -1; + } + + return getsockname(socket, address, address_len); +} + +/* + * testmode_setsockopt + * setsockopt with faults injected. + */ +static int +testmode_setsockopt(const char *caller_name, int socket, int level, + int option_name, const void *option_value, + socklen_t option_len) +{ + if (FINC_HAS_FAULT(FINC_OS_NET_INTERFACE) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + { + LOG(INFO, "inject fault to setsockopt: FINC_OS_NET_INTERFACE"); + errno = ENOMEM; + return -1; + } + + return setsockopt(socket, level, option_name, option_value, option_len); +} + +/* + * testmode_pg_getaddrinfo_all + * pg_getaddrinfo_all with faults injected. + */ +static int +testmode_pg_getaddrinfo_all(const char *caller_name, const char *hostname, + const char *servname, const struct addrinfo *hints, + struct addrinfo **res) +{ + if (FINC_HAS_FAULT(FINC_OS_NET_INTERFACE) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + { + LOG(INFO, "inject fault to pg_getaddrinfo_all: FINC_OS_NET_INTERFACE"); + return -1; + } + + //return pg_getaddrinfo_all(hostname, servname, hints, res); + return getaddrinfo(hostname, servname, hints, res); +} + +/* + * testmode_free + * free function with free time tracking added. 
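+ *		Decrements icudp_malloc_times under icudp_malloc_tracking_lock so the
+ *		allocation/free counts can be balanced; a NULL pointer is a no-op.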
+ */ +static void +testmode_free(const char *caller_name, void *ptr) +{ + if (ptr == NULL) + return; + + pthread_mutex_lock(&icudp_malloc_tracking_lock); + icudp_malloc_times--; + pthread_mutex_unlock(&icudp_malloc_tracking_lock); + free(ptr); +} + +/* + * testmode_pthread_create + * pthread_create with faults injected. + */ +static int +testmode_pthread_create(const char *caller_name, pthread_t *thread, + const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg) +{ + if (FINC_HAS_FAULT(FINC_OS_CREATE_THREAD) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + { + LOG(INFO, "inject fault to pthread_create: FINC_OS_CREATE_THREAD"); + return ENOMEM; + } + + return pthread_create(thread, attr, start_routine, arg); +} + +#ifdef ML_CHECK_FOR_INTERRUPTS +#undef ML_CHECK_FOR_INTERRUPTS +#endif +#undef sendto +#undef recvfrom +#undef poll +#undef socket +#undef bind +#undef getsockname +#undef getsockopt +#undef setsockopt +#undef pg_getaddrinfo_all +#undef malloc +#undef free +#undef palloc0 +#undef pthread_create + +#define ML_CHECK_FOR_INTERRUPTS(teardownActive) \ + testmode_check_interrupts(__FUNCTION__, teardownActive) + +#define sendto(socket, buffer, length, flags, dest_addr, dest_len) \ + testmode_sendto(__FUNCTION__, socket, buffer, length, flags, dest_addr, dest_len) + +#define recvfrom(socket, buffer, length, flags, address, address_len) \ + testmode_recvfrom(__FUNCTION__, socket, buffer, length, flags, address, address_len) + +#define poll(fds, nfds, timeout) \ + testmode_poll(__FUNCTION__, fds, nfds, timeout) + +#define socket(domain, type, protocol) \ + testmode_socket(__FUNCTION__, domain, type, protocol) + +#define bind(socket, address, address_len) \ + testmode_bind(__FUNCTION__, socket, address, address_len) + +#define getsockname(socket, address, address_len) \ + testmode_getsockname(__FUNCTION__, socket, address, address_len) + +#define getsockopt(socket, level, option_name, option_value, option_len) \ + testmode_getsockopt(__FUNCTION__, socket, level, option_name, option_value, option_len) + +#define setsockopt(socket, level, option_name, option_value, option_len) \ + testmode_setsockopt(__FUNCTION__, socket, level, option_name, option_value, option_len) + +#define getaddrinfo(hostname, servname, hints, res) \ + testmode_pg_getaddrinfo_all(__FUNCTION__, hostname, servname, hints, res) + +#define malloc(size) \ + testmode_malloc(__FUNCTION__, size) + +#define free(ptr) \ + testmode_free(__FUNCTION__, ptr) + +/* +#define ic_malloc0(size) \ + testmode_malloc0(__FUNCTION__, size) +*/ + +#define pthread_create(thread, attr, start_routine, arg) \ + testmode_pthread_create(__FUNCTION__, thread, attr, start_routine, arg) +#endif + +#endif \ No newline at end of file diff --git a/contrib/udp2/ic_common/ic_types.h b/contrib/udp2/ic_common/ic_types.h new file mode 100644 index 00000000000..e31d02e493d --- /dev/null +++ b/contrib/udp2/ic_common/ic_types.h @@ -0,0 +1,212 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * ic_types.h + * + * IDENTIFICATION + * contrib/udp2/ic_common/ic_types.h + * + *------------------------------------------------------------------------- + */ +#ifndef IC_TYPES_H +#define IC_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * data type + */ +typedef signed char int8; /* == 8 bits */ +typedef signed short int16; /* == 16 bits */ +typedef signed int int32; /* == 32 bits */ +typedef unsigned char uint8; /* == 8 bits */ +typedef unsigned short uint16; /* == 16 bits */ +typedef unsigned int uint32; /* == 32 bits */ +typedef long int int64; +typedef unsigned long int uint64; + +typedef uint64 DistributedTransactionId; + +/* the lite version of CdbProcess */ +typedef struct ICCdbProcess +{ + bool valid; + char *listenerAddr; /* Interconnect listener IPv4 address, a C-string */ + int listenerPort; /* Interconnect listener port */ + int pid; /* Backend PID of the process. */ + int contentid; + int dbid; + +} ICCdbProcess; + +/* the lite version of ExecSlice */ +typedef struct ICExecSlice +{ + int sliceIndex; + int parentIndex; + + int numChildren; + int *children; + + int numSegments; + + int numPrimaryProcesses; + ICCdbProcess *primaryProcesses; + +} ICExecSlice; + +/* the lite version of SliceTable */ +typedef struct ICSliceTable +{ + int localSlice; /* Index of the slice to execute. */ + int numSlices; + ICExecSlice *slices; /* Array of ICExecSlice, indexed by SliceIndex */ + uint32 ic_instance_id; + +} ICSliceTable; + +typedef struct ICChunkTransportState +{ + /* keeps track of if we've "activated" connections via SetupInterconnect(). */ + bool activated; + bool teardownActive; + + /* slice table stuff. 
*/ + struct ICSliceTable *sliceTable; + int sliceId; + int icInstanceId; /* the same as sliceTable->ic_instance_id */ + + /* whether we've logged when network timeout happens */ + bool networkTimeoutIsLogged; + + /* save client's state */ + void *clientState; + +} ICChunkTransportState; + +struct MemoryBlock +{ + unsigned char *pos; + int len; +}; + +typedef struct MemoryBlock BufferBlock; +typedef struct MemoryBlock DataBlock; + +typedef int (*GetDataLenInPacket)(unsigned char *msg, int msg_size); + +/* + * GlobalMotionLayerIPCParam and SessionMotionLayerIPCParam + */ +typedef bool (*CheckPostmasterIsAliveCallback)(void); +typedef void (*CheckInterruptsCallback)(int teardownActive); +typedef void (*SimpleFaultInjectorCallback)(const char *faultname); +typedef void *(*CreateOpaqueDataWithConn)(void); +typedef void (*DestroyOpaqueDataInConn)(void **); +typedef void (*CheckCancelOnQDCallback)(ICChunkTransportState *pTransportStates); + +typedef struct GlobalMotionLayerIPCParam +{ + char *interconnect_address; /* postmaster.h */ + int Gp_role; /* Gp_role */ + int ic_htab_size; /* cdbgang.h */ + int segment_number; /* getgpsegmentCount() */ + int MyProcPid; /* miscadmin.h */ + int dbid; /* GpIdentity */ + int segindex; /* GpIdentity */ + bool MyProcPort; /* miscadmin.h */ + int myprocport_sock; /* MyProcPort->sock */ + int Gp_max_packet_size; /* default: 8192 */ + int Gp_udp_bufsize_k; /* default: 0 */ + int Gp_interconnect_address_type; /* default: INTERCONNECT_ADDRESS_TYPE_UNICAST_IC */ + + CheckPostmasterIsAliveCallback checkPostmasterIsAliveCallback; + CheckInterruptsCallback checkInterruptsCallback; + SimpleFaultInjectorCallback simpleFaultInjectorCallback; + + CreateOpaqueDataWithConn createOpaqueDataCallback; + DestroyOpaqueDataInConn destroyOpaqueDataCallback; + + CheckCancelOnQDCallback checkCancelOnQDCallback; + +} GlobalMotionLayerIPCParam; + +typedef struct SessionMotionLayerIPCParam +{ + int Gp_interconnect_queue_depth; /* default: 4 */ + int Gp_interconnect_snd_queue_depth; /* default: 2 */ + int Gp_interconnect_cursor_ic_table_size; /* default: 128 */ + int Gp_interconnect_timer_period; /* default: 5 */ + int Gp_interconnect_timer_checking_period; /* default: 20 */ + int Gp_interconnect_default_rtt; /* default: 20 */ + int Gp_interconnect_min_rto; /* default: 20 */ + int Gp_interconnect_transmit_timeout; /* default: 3600 */ + int Gp_interconnect_min_retries_before_timeout; /* default: 100 */ + int Gp_interconnect_debug_retry_interval; /* default: 10 */ + bool gp_interconnect_full_crc; /* default: false */ + bool gp_interconnect_aggressive_retry; /* default: true */ + bool gp_interconnect_cache_future_packets; /* default: true */ + bool gp_interconnect_log_stats; /* default: false */ + int interconnect_setup_timeout; /* default: 7200 */ + int gp_log_interconnect; /* default: terse */ + int gp_session_id; /* global unique id for session. 
*/ + int Gp_interconnect_fc_method; /* default: INTERCONNECT_FC_METHOD_LOSS */ + int gp_command_count; + uint32 gp_interconnect_id; + int log_min_messages; /* default: IC_WARNING */ + DistributedTransactionId distTransId; /* default: 0 */ + + int gp_udpic_dropseg; /* default: -2 */ + int gp_udpic_dropacks_percent; /* default: 0 */ + int gp_udpic_dropxmit_percent; /* default: 0 */ + int gp_udpic_fault_inject_percent; /* default: 0 */ + int gp_udpic_fault_inject_bitmap; /* default: 0 */ + int gp_udpic_network_disable_ipv6; /* default: 0 */ + +} SessionMotionLayerIPCParam; + +/* + * handle error + */ +#define MSGLEN 1024 + +typedef enum ErrorLevel +{ + LEVEL_OK, + LEVEL_ERROR, + LEVEL_FATAL, +} ErrorLevel; + +typedef struct ICError +{ + ErrorLevel level; + char msg[MSGLEN]; +} ICError; + +extern void ResetLastError(); +extern ICError* GetLastError(); +extern void SetLastError(ErrorLevel level, const char *msg); + +#ifdef __cplusplus +} +#endif + +#endif // IC_TYPES_H \ No newline at end of file diff --git a/contrib/udp2/ic_common/ic_utility.cpp b/contrib/udp2/ic_common/ic_utility.cpp new file mode 100644 index 00000000000..7a5a0747238 --- /dev/null +++ b/contrib/udp2/ic_common/ic_utility.cpp @@ -0,0 +1,1371 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * ic_utility.cpp + * + * IDENTIFICATION + * contrib/udp2/ic_common/ic_utility.cpp + * + *------------------------------------------------------------------------- + */ + +#include + +#include "ic_types.h" +#include "ic_utility.hpp" + +/* + * global_param and session_param; + */ +GlobalMotionLayerIPCParam global_param = + {NULL, /* interconnect_address */ + -1, /* Gp_role */ + 0, /* ic_htab_size */ + 0, /* segment_number */ + -1, /* MyProcPid */ + -1, /* dbid */ + -1, /* segindex */ + false,/* MyProcPort */ + -1, /* myprocport_sock */ + 8192, /* Gp_max_packet_size */ + 0, /* Gp_udp_bufsize_k */ + INTERCONNECT_ADDRESS_TYPE_UNICAST_IC, /* Gp_interconnect_address_type */ + NULL, /* CheckPostmasterIsAliveCallback */ + NULL, /* checkInterruptsCallback */ + NULL, /* simpleFaultInjectorCallback */ + NULL, /* createOpaqueDataCallback */ + NULL, /* destroyOpaqueDataCallback */ + NULL, /* checkCancelOnQDCallback */ + }; + +SessionMotionLayerIPCParam session_param = + {4, /* Gp_interconnect_queue_depth */ + 2, /* Gp_interconnect_snd_queue_depth */ + 128, /* Gp_interconnect_cursor_ic_table_size */ + 5, /* Gp_interconnect_timer_period */ + 20, /* Gp_interconnect_timer_checking_period */ + 20, /* Gp_interconnect_default_rtt */ + 20, /* Gp_interconnect_min_rto */ + 3600, /* Gp_interconnect_transmit_timeout */ + 100, /* Gp_interconnect_min_retries_before_timeout */ + 10, /* Gp_interconnect_debug_retry_interval */ + false,/* gp_interconnect_full_crc */ + true, /* gp_interconnect_aggressive_retry */ + true, /* gp_interconnect_cache_future_packets */ + false,/* gp_interconnect_log_stats */ + 7200, /* interconnect_setup_timeout */ + GPVARS_VERBOSITY_TERSE_IC, /* gp_log_interconnect */ + -1, /* gp_session_id */ + INTERCONNECT_FC_METHOD_LOSS_IC, /* Gp_interconnect_fc_method */ + -1, /* gp_command_count */ + 0, /* gp_interconnect_id */ + IC_WARNING, /* log_min_messages */ + 0, /* distTransId */ + + UNDEF_SEGMENT, /* gp_udpic_dropseg */ + 0, /* gp_udpic_dropacks_percent */ + 0, /* gp_udpic_dropxmit_percent */ + 0, /* gp_udpic_fault_inject_percent */ + 0, /* gp_udpic_fault_inject_bitmap */ + 0, /* gp_udpic_network_disable_ipv6 */ + }; + + +/* + * logger stuff + */ +Logger RootLogger; +static std::mutex LoggerMutex; +static thread_local std::once_flag Once; +static thread_local char ProcessId[64]; + +const char * SeverityName[] = { "FATAL", "ERROR", "WARNING", "INFO", "DEBUG1", + "DEBUG2", "DEBUG3", "DEBUG4", "DEBUG5" + }; + +static void InitProcessId() { + std::stringstream ss; + ss.imbue(std::locale::classic()); + ss << "p" << getpid() << ", th" << pthread_self(); + snprintf(ProcessId, sizeof(ProcessId), "%s", ss.str().c_str()); +} + +Logger::Logger() : + fd(STDERR_FILENO), severity(DEFAULT_LOG_LEVEL) { +} + +Logger::~Logger() { +} + +void Logger::setOutputFd(int f) { + fd = f; +} + +void Logger::setLogSeverity(LogSeverity l) { + severity.store(l, std::memory_order_relaxed); +} + +void Logger::printf(LogSeverity s, const char * fmt, ...) 
{ + va_list ap; + + if (s > severity || fd < 0) { + return; + } + + try { + call_once(Once, InitProcessId); + std::vector buffer; + struct tm tm_time; + struct timeval tval; + memset(&tval, 0, sizeof(tval)); + gettimeofday(&tval, NULL); + localtime_r(&tval.tv_sec, &tm_time); + //determine buffer size + va_start(ap, fmt); + int size = vsnprintf(nullptr, 0, fmt, ap); + va_end(ap); + //100 is enough for prefix + buffer.resize(size + 100); + size = snprintf(buffer.data(), buffer.size(), "%04d-%02d-%02d %02d:%02d:%02d.%06ld, %s, %s ", tm_time.tm_year + 1900, + 1 + tm_time.tm_mon, tm_time.tm_mday, tm_time.tm_hour, + tm_time.tm_min, tm_time.tm_sec, static_cast(tval.tv_usec), ProcessId, SeverityName[s]); + va_start(ap, fmt); + size += vsnprintf(buffer.data() + size, buffer.size() - size, fmt, ap); + va_end(ap); + { + std::lock_guard lock(LoggerMutex); + dprintf(fd, "%s\n", buffer.data()); + } + return; + } catch (const std::exception & e) { + dprintf(fd, "%s:%d %s %s", __FILE__, __LINE__, + "FATAL: get an unexpected exception:", e.what()); + throw; + } +} + +/* + * crc32 + */ +static uint32 comp_crc32c_sb8(uint32 crc, const void *data, size_t len); + +/* + * Use slicing-by-8 algorithm. + * + * On big-endian systems, the intermediate value is kept in reverse byte + * order, to avoid byte-swapping during the calculation. FIN_CRC32C reverses + * the bytes to the final order. + */ +#define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) +#define COMP_CRC32C(crc, data, len) \ + ((crc) = comp_crc32c_sb8((crc), (data), (len))) +#ifdef WORDS_BIGENDIAN +#define FIN_CRC32C(crc) ((crc) = ic_bswap32(crc) ^ 0xFFFFFFFF) +#else +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +#endif + +uint32 +ComputeCRC(const void *data, size_t len) +{ + uint32 local_crc; + + INIT_CRC32C(local_crc); + COMP_CRC32C(local_crc, data, len); + FIN_CRC32C(local_crc); + + return local_crc; +} + +/* + * Lookup tables for the slicing-by-8 algorithm, for the so-called Castagnoli + * polynomial (the same that is used e.g. in iSCSI), 0x1EDC6F41. Using + * Williams' terms, this is the "normal", not "reflected" version. However, on + * big-endian systems the values in the tables are stored in byte-reversed + * order (IOW, the tables are stored in little-endian order even on big-endian + * systems). 
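+ *
+ * The eight tables give the CRC contribution of each of the eight byte
+ * positions consumed per iteration, which is what lets comp_crc32c_sb8 fold
+ * the input eight bytes at a time.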
+ */ +static const uint32 crc32c_table[8][256] = { +#ifndef WORDS_BIGENDIAN + { + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, + 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, + 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, + 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, + 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, + 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, + 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, + 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, + 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, + 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, + 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, + 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, + 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, + 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, + 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, + 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, + 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, + 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, + 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, + 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, + 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, + 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, + 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, + 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, + 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, + 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, + 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, + 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, + 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, + 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, + 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, + 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, + 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 + }, + { + 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, + 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, + 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, + 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD, + 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, + 0x714F905D, 
0x62ED082A, 0x560AA0B3, 0x45A838C4, + 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, + 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C, + 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, + 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47, + 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, + 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF, + 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, + 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6, + 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, + 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E, + 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, + 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41, + 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, + 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9, + 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, + 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0, + 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, + 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78, + 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, + 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43, + 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, + 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB, + 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, + 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2, + 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, + 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A, + 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, + 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC, + 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, + 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004, + 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, + 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D, + 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, + 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185, + 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, + 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE, + 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, + 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306, + 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, + 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F, + 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, + 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287, + 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, + 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8, + 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, + 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600, + 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, + 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439, + 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, + 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781, + 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, + 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA, + 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, + 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502, + 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, + 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B, + 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, + 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 + }, + { + 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, + 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, + 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, + 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC, + 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, + 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3, + 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, + 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726, + 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, + 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D, + 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, + 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8, + 
0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, + 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7, + 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, + 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32, + 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, + 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0, + 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, + 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75, + 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, + 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A, + 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, + 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF, + 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, + 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4, + 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, + 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161, + 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, + 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E, + 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, + 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB, + 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, + 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A, + 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, + 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF, + 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, + 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0, + 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, + 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065, + 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, + 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E, + 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, + 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB, + 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, + 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4, + 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, + 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71, + 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, + 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3, + 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, + 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36, + 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, + 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79, + 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, + 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC, + 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, + 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7, + 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, + 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622, + 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, + 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D, + 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, + 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 + }, + { + 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, + 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, + 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, + 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C, + 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, + 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7, + 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, + 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11, + 0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, + 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41, + 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, + 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7, + 0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, + 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C, + 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, + 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A, + 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, + 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D, + 0x5DE23C01, 0x80A796B9, 0xE2851F80, 
0x3FC0B538, + 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB, + 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, + 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610, + 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, + 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6, + 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, + 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6, + 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, + 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040, + 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, + 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B, + 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, + 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D, + 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, + 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5, + 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, + 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213, + 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, + 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8, + 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, + 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E, + 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, + 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E, + 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, + 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698, + 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, + 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443, + 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, + 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5, + 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, + 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12, + 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, + 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4, + 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, + 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F, + 0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, + 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9, + 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, + 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99, + 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, + 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F, + 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, + 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4, + 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, + 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 + }, + { + 0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, + 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44, + 0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, + 0x2522B521, 0x1D33DA8D, 0x55006A79, 0x6D1105D5, + 0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, + 0x6F67DF63, 0x5776B0CF, 0x1F45003B, 0x27546F97, + 0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, + 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406, + 0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, 0x539B05A3, + 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13, + 0xDECFBEC6, 0xE6DED16A, 0xAEED619E, 0x96FC0E32, + 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082, + 0x948AD484, 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, + 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0, + 0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, + 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, 0xF99BD151, + 0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, + 0xD714D41E, 0xEF05BBB2, 0xA7360B46, 0x9F2764EA, + 0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, + 0x1273DF8F, 0x2A62B023, 0x625100D7, 0x5A406F7B, + 0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, + 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539, + 0x7D1400EC, 0x45056F40, 0x0D36DFB4, 0x3527B018, + 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8, + 0x2CF9DFF9, 0x14E8B055, 0x5CDB00A1, 0x64CA6F0D, + 0xCCBC6149, 0xF4AD0EE5, 
0xBC9EBE11, 0x848FD1BD, + 0xE99ED468, 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, + 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C, + 0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, + 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, 0x0BADB06E, + 0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, + 0x86F90B0B, 0xBEE864A7, 0xF6DBD453, 0xCECABBFF, + 0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, + 0x8EE76BEC, 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18, + 0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, + 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089, + 0xE180B48F, 0xD991DB23, 0x91A26BD7, 0xA9B3047B, + 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB, + 0x24E7BF1E, 0x1CF6D0B2, 0x54C56046, 0x6CD40FEA, + 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A, + 0x750A600B, 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, + 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F, + 0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, + 0x5028D52A, 0x6839BA86, 0x200A0A72, 0x181B65DE, + 0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, + 0x1A6DBF68, 0x227CD0C4, 0x6A4F6030, 0x525E0F9C, + 0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, + 0xDF0AB4F9, 0xE71BDB55, 0xAF286BA1, 0x9739040D, + 0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, + 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6, + 0x9C94B463, 0xA485DBCF, 0xECB66B3B, 0xD4A70497, + 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27, + 0xD6D1DE21, 0xEEC0B18D, 0xA6F30179, 0x9EE26ED5, + 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065, + 0x13B6D5B0, 0x2BA7BA1C, 0x63940AE8, 0x5B856544, + 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4, + 0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, + 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, 0xEA2D04E1, + 0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, + 0x6779BF84, 0x5F68D028, 0x175B60DC, 0x2F4A0F70, + 0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, + 0x2D3CD5C6, 0x152DBA6A, 0x5D1E0A9E, 0x650F6532, + 0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, + 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3 + }, + { + 0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, + 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD, + 0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, + 0xD2F32F68, 0x3DC34471, 0x097F8FAB, 0xE64FE4B2, + 0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, + 0x72F90749, 0x9DC96C50, 0xA975A78A, 0x4645CC93, + 0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, + 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C, + 0x85F4168D, 0x6AC47D94, 0x5E78B64E, 0xB148DD57, + 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20, + 0xE5F20E92, 0x0AC2658B, 0x3E7EAE51, 0xD14EC548, + 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F, + 0x45F826B3, 0xAAC84DAA, 0x9E748670, 0x7144ED69, + 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E, + 0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, + 0x970B09DB, 0x783B62C2, 0x4C87A918, 0xA3B7C201, + 0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, + 0xBCF16C9C, 0x53C10785, 0x677DCC5F, 0x884DA746, + 0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, + 0xDCF77483, 0x33C71F9A, 0x077BD440, 0xE84BBF59, + 0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, + 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778, + 0xAE0E73CA, 0x413E18D3, 0x7582D309, 0x9AB2B810, + 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67, + 0x8BF04D66, 0x64C0267F, 0x507CEDA5, 0xBF4C86BC, + 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB, + 0xEBF65579, 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, + 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4, + 0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, + 0xF9094A2F, 0x16392136, 0x2285EAEC, 0xCDB581F5, + 0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, + 0x990F5230, 0x763F3929, 0x4283F2F3, 0xADB399EA, + 0x1C08B7D6, 
0xF338DCCF, 0xC7841715, 0x28B47C0C, + 0xAEFD80A1, 0x41CDEBB8, 0x75712062, 0x9A414B7B, + 0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, + 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364, + 0xDC0487E8, 0x3334ECF1, 0x0788272B, 0xE8B84C32, + 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45, + 0xBC029FF7, 0x5332F4EE, 0x678E3F34, 0x88BE542D, + 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A, + 0x99FCA15B, 0x76CCCA42, 0x42700198, 0xAD406A81, + 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6, + 0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, + 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, 0x7FB345E9, + 0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, + 0xEB05A612, 0x0435CD0B, 0x308906D1, 0xDFB96DC8, + 0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, + 0x8B03BE0D, 0x6433D514, 0x508F1ECE, 0xBFBF75D7, + 0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, + 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090, + 0x720AF422, 0x9D3A9F3B, 0xA98654E1, 0x46B63FF8, + 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F, + 0xD200DC03, 0x3D30B71A, 0x098C7CC0, 0xE6BC17D9, + 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE, + 0xB206C41C, 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, + 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1, + 0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, + 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, 0x11B1061D, + 0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, + 0x450BD5D8, 0xAA3BBEC1, 0x9E87751B, 0x71B71E02, + 0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, + 0xE501FDF9, 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623, + 0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, + 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C + }, + { + 0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, + 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089, + 0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, + 0xEBCD3882, 0x83CE144A, 0x3BCB6112, 0x53C84DDA, + 0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, + 0x39BB3F77, 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F, + 0xD27607F5, 0xBA752B3D, 0x02705E65, 0x6A7372AD, + 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C, + 0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, 0x855EF6E5, + 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334, + 0x73767EEE, 0x1B755226, 0xA370277E, 0xCB730BB6, + 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67, + 0xA100791B, 0xC90355D3, 0x7106208B, 0x19050C43, + 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992, + 0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, + 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, 0xF2C834C1, + 0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, + 0xDF57C2AB, 0xB754EE63, 0x0F519B3B, 0x6752B7F3, + 0x349AFA29, 0x5C99D6E1, 0xE49CA3B9, 0x8C9F8F71, + 0x917A3FF8, 0xF9791330, 0x417C6668, 0x297F4AA0, + 0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, + 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55, + 0xA8C1008F, 0xC0C22C47, 0x78C7591F, 0x10C475D7, + 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006, + 0x47EC84C7, 0x2FEFA80F, 0x97EADD57, 0xFFE9F19F, + 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E, + 0x09C17994, 0x61C2555C, 0xD9C72004, 0xB1C40CCC, + 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D, + 0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, + 0x7E57BBB0, 0x16549778, 0xAE51E220, 0xC652CEE8, + 0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, + 0x307A46E3, 0x58796A2B, 0xE07C1F73, 0x887F33BB, + 0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, + 0x508ECB25, 0x388DE7ED, 0x808892B5, 0xE88BBE7D, + 0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, + 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E, + 0x6935F452, 0x0136D89A, 0xB933ADC2, 0xD130810A, + 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB, + 0x27180901, 0x4F1B25C9, 0xF71E5091, 0x9F1D7C59, + 
0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988, + 0xC8358D49, 0xA036A181, 0x1833D4D9, 0x7030F811, + 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0, + 0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, + 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, 0x9BFDC093, + 0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, + 0xF18EB23E, 0x998D9EF6, 0x2188EBAE, 0x498BC766, + 0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, + 0xBFA34F6D, 0xD7A063A5, 0x6FA516FD, 0x07A63A35, + 0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, + 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907, + 0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, 0x79F18185, + 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454, + 0x1382F328, 0x7B81DFE0, 0xC384AAB8, 0xAB878670, + 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1, + 0x5DAF0E7B, 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, + 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2, + 0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, + 0x17624FE2, 0x7F61632A, 0xC7641672, 0xAF673ABA, + 0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, + 0x594FB2B1, 0x314C9E79, 0x8949EB21, 0xE14AC7E9, + 0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, + 0x8B39B544, 0xE33A998C, 0x5B3FECD4, 0x333CC01C, + 0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, + 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F + }, + { + 0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, + 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504, + 0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, + 0x632686B7, 0x2A1AFB90, 0xF15E7CF9, 0xB86201DE, + 0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, + 0xA56B8BD9, 0xEC57F6FE, 0x37137197, 0x7E2F0CB0, + 0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, + 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A, + 0x0D006599, 0x443C18BE, 0x9F789FD7, 0xD644E2F0, + 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D, + 0x4F3B6143, 0x06071C64, 0xDD439B0D, 0x947FE62A, + 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447, + 0x89766C2D, 0xC04A110A, 0x1B0E9663, 0x5232EB44, + 0xA86BEE40, 0xE1579367, 0x3A13140E, 0x732F6929, + 0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, + 0xEA50EA9A, 0xA36C97BD, 0x782810D4, 0x31146DF3, + 0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, + 0x3B1D495F, 0x72213478, 0xA965B311, 0xE059CE36, + 0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, + 0x79264D85, 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC, + 0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, + 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782, + 0xDC4DC65C, 0x9571BB7B, 0x4E353C12, 0x07094135, + 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358, + 0x1700AEAB, 0x5E3CD38C, 0x857854E5, 0xCC4429C2, + 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF, + 0x553BAA71, 0x1C07D756, 0xC743503F, 0x8E7F2D18, + 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75, + 0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, + 0xB26B2572, 0xFB575855, 0x2013DF3C, 0x692FA21B, + 0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, + 0xF05021A8, 0xB96C5C8F, 0x6228DBE6, 0x2B14A6C1, + 0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, + 0x151C1409, 0x5C20692E, 0x8764EE47, 0xCE589360, + 0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, + 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA, + 0xB0779FD0, 0xF94BE2F7, 0x220F659E, 0x6B3318B9, + 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4, + 0xF24C9B0A, 0xBB70E62D, 0x60346144, 0x29081C63, + 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E, + 0x3901F3FD, 0x703D8EDA, 0xAB7909B3, 0xE2457494, + 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9, + 0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, + 0x5A27754A, 0x131B086D, 0xC85F8F04, 0x8163F223, + 0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, + 0x9C6A7824, 0xD5560503, 0x0E12826A, 
0x472EFF4D, + 0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, + 0xDE517CFE, 0x976D01D9, 0x4C2986B0, 0x0515FB97, + 0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, + 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852, + 0x6C3A598C, 0x250624AB, 0xFE42A3C2, 0xB77EDEE5, + 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88, + 0xAA7754E2, 0xE34B29C5, 0x380FAEAC, 0x7133D38B, + 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6, + 0xE84C5038, 0xA1702D1F, 0x7A34AA76, 0x3308D751, + 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C, + 0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, + 0x021CBAA2, 0x4B20C785, 0x906440EC, 0xD9583DCB, + 0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, + 0x4027BE78, 0x091BC35F, 0xD25F4436, 0x9B633911, + 0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, + 0x866AB316, 0xCF56CE31, 0x14124958, 0x5D2E347F, + 0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, + 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5 + } +#else /* !WORDS_BIGENDIAN */ + { + 0x00000000, 0x03836BF2, 0xF7703BE1, 0xF4F35013, + 0x1F979AC7, 0x1C14F135, 0xE8E7A126, 0xEB64CAD4, + 0xCF58D98A, 0xCCDBB278, 0x3828E26B, 0x3BAB8999, + 0xD0CF434D, 0xD34C28BF, 0x27BF78AC, 0x243C135E, + 0x6FC75E10, 0x6C4435E2, 0x98B765F1, 0x9B340E03, + 0x7050C4D7, 0x73D3AF25, 0x8720FF36, 0x84A394C4, + 0xA09F879A, 0xA31CEC68, 0x57EFBC7B, 0x546CD789, + 0xBF081D5D, 0xBC8B76AF, 0x487826BC, 0x4BFB4D4E, + 0xDE8EBD20, 0xDD0DD6D2, 0x29FE86C1, 0x2A7DED33, + 0xC11927E7, 0xC29A4C15, 0x36691C06, 0x35EA77F4, + 0x11D664AA, 0x12550F58, 0xE6A65F4B, 0xE52534B9, + 0x0E41FE6D, 0x0DC2959F, 0xF931C58C, 0xFAB2AE7E, + 0xB149E330, 0xB2CA88C2, 0x4639D8D1, 0x45BAB323, + 0xAEDE79F7, 0xAD5D1205, 0x59AE4216, 0x5A2D29E4, + 0x7E113ABA, 0x7D925148, 0x8961015B, 0x8AE26AA9, + 0x6186A07D, 0x6205CB8F, 0x96F69B9C, 0x9575F06E, + 0xBC1D7B41, 0xBF9E10B3, 0x4B6D40A0, 0x48EE2B52, + 0xA38AE186, 0xA0098A74, 0x54FADA67, 0x5779B195, + 0x7345A2CB, 0x70C6C939, 0x8435992A, 0x87B6F2D8, + 0x6CD2380C, 0x6F5153FE, 0x9BA203ED, 0x9821681F, + 0xD3DA2551, 0xD0594EA3, 0x24AA1EB0, 0x27297542, + 0xCC4DBF96, 0xCFCED464, 0x3B3D8477, 0x38BEEF85, + 0x1C82FCDB, 0x1F019729, 0xEBF2C73A, 0xE871ACC8, + 0x0315661C, 0x00960DEE, 0xF4655DFD, 0xF7E6360F, + 0x6293C661, 0x6110AD93, 0x95E3FD80, 0x96609672, + 0x7D045CA6, 0x7E873754, 0x8A746747, 0x89F70CB5, + 0xADCB1FEB, 0xAE487419, 0x5ABB240A, 0x59384FF8, + 0xB25C852C, 0xB1DFEEDE, 0x452CBECD, 0x46AFD53F, + 0x0D549871, 0x0ED7F383, 0xFA24A390, 0xF9A7C862, + 0x12C302B6, 0x11406944, 0xE5B33957, 0xE63052A5, + 0xC20C41FB, 0xC18F2A09, 0x357C7A1A, 0x36FF11E8, + 0xDD9BDB3C, 0xDE18B0CE, 0x2AEBE0DD, 0x29688B2F, + 0x783BF682, 0x7BB89D70, 0x8F4BCD63, 0x8CC8A691, + 0x67AC6C45, 0x642F07B7, 0x90DC57A4, 0x935F3C56, + 0xB7632F08, 0xB4E044FA, 0x401314E9, 0x43907F1B, + 0xA8F4B5CF, 0xAB77DE3D, 0x5F848E2E, 0x5C07E5DC, + 0x17FCA892, 0x147FC360, 0xE08C9373, 0xE30FF881, + 0x086B3255, 0x0BE859A7, 0xFF1B09B4, 0xFC986246, + 0xD8A47118, 0xDB271AEA, 0x2FD44AF9, 0x2C57210B, + 0xC733EBDF, 0xC4B0802D, 0x3043D03E, 0x33C0BBCC, + 0xA6B54BA2, 0xA5362050, 0x51C57043, 0x52461BB1, + 0xB922D165, 0xBAA1BA97, 0x4E52EA84, 0x4DD18176, + 0x69ED9228, 0x6A6EF9DA, 0x9E9DA9C9, 0x9D1EC23B, + 0x767A08EF, 0x75F9631D, 0x810A330E, 0x828958FC, + 0xC97215B2, 0xCAF17E40, 0x3E022E53, 0x3D8145A1, + 0xD6E58F75, 0xD566E487, 0x2195B494, 0x2216DF66, + 0x062ACC38, 0x05A9A7CA, 0xF15AF7D9, 0xF2D99C2B, + 0x19BD56FF, 0x1A3E3D0D, 0xEECD6D1E, 0xED4E06EC, + 0xC4268DC3, 0xC7A5E631, 0x3356B622, 0x30D5DDD0, + 0xDBB11704, 0xD8327CF6, 0x2CC12CE5, 0x2F424717, + 0x0B7E5449, 0x08FD3FBB, 0xFC0E6FA8, 0xFF8D045A, + 0x14E9CE8E, 0x176AA57C, 0xE399F56F, 0xE01A9E9D, + 
0xABE1D3D3, 0xA862B821, 0x5C91E832, 0x5F1283C0, + 0xB4764914, 0xB7F522E6, 0x430672F5, 0x40851907, + 0x64B90A59, 0x673A61AB, 0x93C931B8, 0x904A5A4A, + 0x7B2E909E, 0x78ADFB6C, 0x8C5EAB7F, 0x8FDDC08D, + 0x1AA830E3, 0x192B5B11, 0xEDD80B02, 0xEE5B60F0, + 0x053FAA24, 0x06BCC1D6, 0xF24F91C5, 0xF1CCFA37, + 0xD5F0E969, 0xD673829B, 0x2280D288, 0x2103B97A, + 0xCA6773AE, 0xC9E4185C, 0x3D17484F, 0x3E9423BD, + 0x756F6EF3, 0x76EC0501, 0x821F5512, 0x819C3EE0, + 0x6AF8F434, 0x697B9FC6, 0x9D88CFD5, 0x9E0BA427, + 0xBA37B779, 0xB9B4DC8B, 0x4D478C98, 0x4EC4E76A, + 0xA5A02DBE, 0xA623464C, 0x52D0165F, 0x51537DAD, + }, + { + 0x00000000, 0x7798A213, 0xEE304527, 0x99A8E734, + 0xDC618A4E, 0xABF9285D, 0x3251CF69, 0x45C96D7A, + 0xB8C3149D, 0xCF5BB68E, 0x56F351BA, 0x216BF3A9, + 0x64A29ED3, 0x133A3CC0, 0x8A92DBF4, 0xFD0A79E7, + 0x81F1C53F, 0xF669672C, 0x6FC18018, 0x1859220B, + 0x5D904F71, 0x2A08ED62, 0xB3A00A56, 0xC438A845, + 0x3932D1A2, 0x4EAA73B1, 0xD7029485, 0xA09A3696, + 0xE5535BEC, 0x92CBF9FF, 0x0B631ECB, 0x7CFBBCD8, + 0x02E38B7F, 0x757B296C, 0xECD3CE58, 0x9B4B6C4B, + 0xDE820131, 0xA91AA322, 0x30B24416, 0x472AE605, + 0xBA209FE2, 0xCDB83DF1, 0x5410DAC5, 0x238878D6, + 0x664115AC, 0x11D9B7BF, 0x8871508B, 0xFFE9F298, + 0x83124E40, 0xF48AEC53, 0x6D220B67, 0x1ABAA974, + 0x5F73C40E, 0x28EB661D, 0xB1438129, 0xC6DB233A, + 0x3BD15ADD, 0x4C49F8CE, 0xD5E11FFA, 0xA279BDE9, + 0xE7B0D093, 0x90287280, 0x098095B4, 0x7E1837A7, + 0x04C617FF, 0x735EB5EC, 0xEAF652D8, 0x9D6EF0CB, + 0xD8A79DB1, 0xAF3F3FA2, 0x3697D896, 0x410F7A85, + 0xBC050362, 0xCB9DA171, 0x52354645, 0x25ADE456, + 0x6064892C, 0x17FC2B3F, 0x8E54CC0B, 0xF9CC6E18, + 0x8537D2C0, 0xF2AF70D3, 0x6B0797E7, 0x1C9F35F4, + 0x5956588E, 0x2ECEFA9D, 0xB7661DA9, 0xC0FEBFBA, + 0x3DF4C65D, 0x4A6C644E, 0xD3C4837A, 0xA45C2169, + 0xE1954C13, 0x960DEE00, 0x0FA50934, 0x783DAB27, + 0x06259C80, 0x71BD3E93, 0xE815D9A7, 0x9F8D7BB4, + 0xDA4416CE, 0xADDCB4DD, 0x347453E9, 0x43ECF1FA, + 0xBEE6881D, 0xC97E2A0E, 0x50D6CD3A, 0x274E6F29, + 0x62870253, 0x151FA040, 0x8CB74774, 0xFB2FE567, + 0x87D459BF, 0xF04CFBAC, 0x69E41C98, 0x1E7CBE8B, + 0x5BB5D3F1, 0x2C2D71E2, 0xB58596D6, 0xC21D34C5, + 0x3F174D22, 0x488FEF31, 0xD1270805, 0xA6BFAA16, + 0xE376C76C, 0x94EE657F, 0x0D46824B, 0x7ADE2058, + 0xF9FAC3FB, 0x8E6261E8, 0x17CA86DC, 0x605224CF, + 0x259B49B5, 0x5203EBA6, 0xCBAB0C92, 0xBC33AE81, + 0x4139D766, 0x36A17575, 0xAF099241, 0xD8913052, + 0x9D585D28, 0xEAC0FF3B, 0x7368180F, 0x04F0BA1C, + 0x780B06C4, 0x0F93A4D7, 0x963B43E3, 0xE1A3E1F0, + 0xA46A8C8A, 0xD3F22E99, 0x4A5AC9AD, 0x3DC26BBE, + 0xC0C81259, 0xB750B04A, 0x2EF8577E, 0x5960F56D, + 0x1CA99817, 0x6B313A04, 0xF299DD30, 0x85017F23, + 0xFB194884, 0x8C81EA97, 0x15290DA3, 0x62B1AFB0, + 0x2778C2CA, 0x50E060D9, 0xC94887ED, 0xBED025FE, + 0x43DA5C19, 0x3442FE0A, 0xADEA193E, 0xDA72BB2D, + 0x9FBBD657, 0xE8237444, 0x718B9370, 0x06133163, + 0x7AE88DBB, 0x0D702FA8, 0x94D8C89C, 0xE3406A8F, + 0xA68907F5, 0xD111A5E6, 0x48B942D2, 0x3F21E0C1, + 0xC22B9926, 0xB5B33B35, 0x2C1BDC01, 0x5B837E12, + 0x1E4A1368, 0x69D2B17B, 0xF07A564F, 0x87E2F45C, + 0xFD3CD404, 0x8AA47617, 0x130C9123, 0x64943330, + 0x215D5E4A, 0x56C5FC59, 0xCF6D1B6D, 0xB8F5B97E, + 0x45FFC099, 0x3267628A, 0xABCF85BE, 0xDC5727AD, + 0x999E4AD7, 0xEE06E8C4, 0x77AE0FF0, 0x0036ADE3, + 0x7CCD113B, 0x0B55B328, 0x92FD541C, 0xE565F60F, + 0xA0AC9B75, 0xD7343966, 0x4E9CDE52, 0x39047C41, + 0xC40E05A6, 0xB396A7B5, 0x2A3E4081, 0x5DA6E292, + 0x186F8FE8, 0x6FF72DFB, 0xF65FCACF, 0x81C768DC, + 0xFFDF5F7B, 0x8847FD68, 0x11EF1A5C, 0x6677B84F, + 0x23BED535, 0x54267726, 0xCD8E9012, 0xBA163201, + 0x471C4BE6, 0x3084E9F5, 0xA92C0EC1, 
0xDEB4ACD2, + 0x9B7DC1A8, 0xECE563BB, 0x754D848F, 0x02D5269C, + 0x7E2E9A44, 0x09B63857, 0x901EDF63, 0xE7867D70, + 0xA24F100A, 0xD5D7B219, 0x4C7F552D, 0x3BE7F73E, + 0xC6ED8ED9, 0xB1752CCA, 0x28DDCBFE, 0x5F4569ED, + 0x1A8C0497, 0x6D14A684, 0xF4BC41B0, 0x8324E3A3, + }, + { + 0x00000000, 0x7E9241A5, 0x0D526F4F, 0x73C02EEA, + 0x1AA4DE9E, 0x64369F3B, 0x17F6B1D1, 0x6964F074, + 0xC53E5138, 0xBBAC109D, 0xC86C3E77, 0xB6FE7FD2, + 0xDF9A8FA6, 0xA108CE03, 0xD2C8E0E9, 0xAC5AA14C, + 0x8A7DA270, 0xF4EFE3D5, 0x872FCD3F, 0xF9BD8C9A, + 0x90D97CEE, 0xEE4B3D4B, 0x9D8B13A1, 0xE3195204, + 0x4F43F348, 0x31D1B2ED, 0x42119C07, 0x3C83DDA2, + 0x55E72DD6, 0x2B756C73, 0x58B54299, 0x2627033C, + 0x14FB44E1, 0x6A690544, 0x19A92BAE, 0x673B6A0B, + 0x0E5F9A7F, 0x70CDDBDA, 0x030DF530, 0x7D9FB495, + 0xD1C515D9, 0xAF57547C, 0xDC977A96, 0xA2053B33, + 0xCB61CB47, 0xB5F38AE2, 0xC633A408, 0xB8A1E5AD, + 0x9E86E691, 0xE014A734, 0x93D489DE, 0xED46C87B, + 0x8422380F, 0xFAB079AA, 0x89705740, 0xF7E216E5, + 0x5BB8B7A9, 0x252AF60C, 0x56EAD8E6, 0x28789943, + 0x411C6937, 0x3F8E2892, 0x4C4E0678, 0x32DC47DD, + 0xD98065C7, 0xA7122462, 0xD4D20A88, 0xAA404B2D, + 0xC324BB59, 0xBDB6FAFC, 0xCE76D416, 0xB0E495B3, + 0x1CBE34FF, 0x622C755A, 0x11EC5BB0, 0x6F7E1A15, + 0x061AEA61, 0x7888ABC4, 0x0B48852E, 0x75DAC48B, + 0x53FDC7B7, 0x2D6F8612, 0x5EAFA8F8, 0x203DE95D, + 0x49591929, 0x37CB588C, 0x440B7666, 0x3A9937C3, + 0x96C3968F, 0xE851D72A, 0x9B91F9C0, 0xE503B865, + 0x8C674811, 0xF2F509B4, 0x8135275E, 0xFFA766FB, + 0xCD7B2126, 0xB3E96083, 0xC0294E69, 0xBEBB0FCC, + 0xD7DFFFB8, 0xA94DBE1D, 0xDA8D90F7, 0xA41FD152, + 0x0845701E, 0x76D731BB, 0x05171F51, 0x7B855EF4, + 0x12E1AE80, 0x6C73EF25, 0x1FB3C1CF, 0x6121806A, + 0x47068356, 0x3994C2F3, 0x4A54EC19, 0x34C6ADBC, + 0x5DA25DC8, 0x23301C6D, 0x50F03287, 0x2E627322, + 0x8238D26E, 0xFCAA93CB, 0x8F6ABD21, 0xF1F8FC84, + 0x989C0CF0, 0xE60E4D55, 0x95CE63BF, 0xEB5C221A, + 0x4377278B, 0x3DE5662E, 0x4E2548C4, 0x30B70961, + 0x59D3F915, 0x2741B8B0, 0x5481965A, 0x2A13D7FF, + 0x864976B3, 0xF8DB3716, 0x8B1B19FC, 0xF5895859, + 0x9CEDA82D, 0xE27FE988, 0x91BFC762, 0xEF2D86C7, + 0xC90A85FB, 0xB798C45E, 0xC458EAB4, 0xBACAAB11, + 0xD3AE5B65, 0xAD3C1AC0, 0xDEFC342A, 0xA06E758F, + 0x0C34D4C3, 0x72A69566, 0x0166BB8C, 0x7FF4FA29, + 0x16900A5D, 0x68024BF8, 0x1BC26512, 0x655024B7, + 0x578C636A, 0x291E22CF, 0x5ADE0C25, 0x244C4D80, + 0x4D28BDF4, 0x33BAFC51, 0x407AD2BB, 0x3EE8931E, + 0x92B23252, 0xEC2073F7, 0x9FE05D1D, 0xE1721CB8, + 0x8816ECCC, 0xF684AD69, 0x85448383, 0xFBD6C226, + 0xDDF1C11A, 0xA36380BF, 0xD0A3AE55, 0xAE31EFF0, + 0xC7551F84, 0xB9C75E21, 0xCA0770CB, 0xB495316E, + 0x18CF9022, 0x665DD187, 0x159DFF6D, 0x6B0FBEC8, + 0x026B4EBC, 0x7CF90F19, 0x0F3921F3, 0x71AB6056, + 0x9AF7424C, 0xE46503E9, 0x97A52D03, 0xE9376CA6, + 0x80539CD2, 0xFEC1DD77, 0x8D01F39D, 0xF393B238, + 0x5FC91374, 0x215B52D1, 0x529B7C3B, 0x2C093D9E, + 0x456DCDEA, 0x3BFF8C4F, 0x483FA2A5, 0x36ADE300, + 0x108AE03C, 0x6E18A199, 0x1DD88F73, 0x634ACED6, + 0x0A2E3EA2, 0x74BC7F07, 0x077C51ED, 0x79EE1048, + 0xD5B4B104, 0xAB26F0A1, 0xD8E6DE4B, 0xA6749FEE, + 0xCF106F9A, 0xB1822E3F, 0xC24200D5, 0xBCD04170, + 0x8E0C06AD, 0xF09E4708, 0x835E69E2, 0xFDCC2847, + 0x94A8D833, 0xEA3A9996, 0x99FAB77C, 0xE768F6D9, + 0x4B325795, 0x35A01630, 0x466038DA, 0x38F2797F, + 0x5196890B, 0x2F04C8AE, 0x5CC4E644, 0x2256A7E1, + 0x0471A4DD, 0x7AE3E578, 0x0923CB92, 0x77B18A37, + 0x1ED57A43, 0x60473BE6, 0x1387150C, 0x6D1554A9, + 0xC14FF5E5, 0xBFDDB440, 0xCC1D9AAA, 0xB28FDB0F, + 0xDBEB2B7B, 0xA5796ADE, 0xD6B94434, 0xA82B0591, + }, + { + 0x00000000, 0xB8AA45DD, 0x812367BF, 0x39892262, + 0xF331227B, 
0x4B9B67A6, 0x721245C4, 0xCAB80019, + 0xE66344F6, 0x5EC9012B, 0x67402349, 0xDFEA6694, + 0x1552668D, 0xADF82350, 0x94710132, 0x2CDB44EF, + 0x3DB164E9, 0x851B2134, 0xBC920356, 0x0438468B, + 0xCE804692, 0x762A034F, 0x4FA3212D, 0xF70964F0, + 0xDBD2201F, 0x637865C2, 0x5AF147A0, 0xE25B027D, + 0x28E30264, 0x904947B9, 0xA9C065DB, 0x116A2006, + 0x8B1425D7, 0x33BE600A, 0x0A374268, 0xB29D07B5, + 0x782507AC, 0xC08F4271, 0xF9066013, 0x41AC25CE, + 0x6D776121, 0xD5DD24FC, 0xEC54069E, 0x54FE4343, + 0x9E46435A, 0x26EC0687, 0x1F6524E5, 0xA7CF6138, + 0xB6A5413E, 0x0E0F04E3, 0x37862681, 0x8F2C635C, + 0x45946345, 0xFD3E2698, 0xC4B704FA, 0x7C1D4127, + 0x50C605C8, 0xE86C4015, 0xD1E56277, 0x694F27AA, + 0xA3F727B3, 0x1B5D626E, 0x22D4400C, 0x9A7E05D1, + 0xE75FA6AB, 0x5FF5E376, 0x667CC114, 0xDED684C9, + 0x146E84D0, 0xACC4C10D, 0x954DE36F, 0x2DE7A6B2, + 0x013CE25D, 0xB996A780, 0x801F85E2, 0x38B5C03F, + 0xF20DC026, 0x4AA785FB, 0x732EA799, 0xCB84E244, + 0xDAEEC242, 0x6244879F, 0x5BCDA5FD, 0xE367E020, + 0x29DFE039, 0x9175A5E4, 0xA8FC8786, 0x1056C25B, + 0x3C8D86B4, 0x8427C369, 0xBDAEE10B, 0x0504A4D6, + 0xCFBCA4CF, 0x7716E112, 0x4E9FC370, 0xF63586AD, + 0x6C4B837C, 0xD4E1C6A1, 0xED68E4C3, 0x55C2A11E, + 0x9F7AA107, 0x27D0E4DA, 0x1E59C6B8, 0xA6F38365, + 0x8A28C78A, 0x32828257, 0x0B0BA035, 0xB3A1E5E8, + 0x7919E5F1, 0xC1B3A02C, 0xF83A824E, 0x4090C793, + 0x51FAE795, 0xE950A248, 0xD0D9802A, 0x6873C5F7, + 0xA2CBC5EE, 0x1A618033, 0x23E8A251, 0x9B42E78C, + 0xB799A363, 0x0F33E6BE, 0x36BAC4DC, 0x8E108101, + 0x44A88118, 0xFC02C4C5, 0xC58BE6A7, 0x7D21A37A, + 0x3FC9A052, 0x8763E58F, 0xBEEAC7ED, 0x06408230, + 0xCCF88229, 0x7452C7F4, 0x4DDBE596, 0xF571A04B, + 0xD9AAE4A4, 0x6100A179, 0x5889831B, 0xE023C6C6, + 0x2A9BC6DF, 0x92318302, 0xABB8A160, 0x1312E4BD, + 0x0278C4BB, 0xBAD28166, 0x835BA304, 0x3BF1E6D9, + 0xF149E6C0, 0x49E3A31D, 0x706A817F, 0xC8C0C4A2, + 0xE41B804D, 0x5CB1C590, 0x6538E7F2, 0xDD92A22F, + 0x172AA236, 0xAF80E7EB, 0x9609C589, 0x2EA38054, + 0xB4DD8585, 0x0C77C058, 0x35FEE23A, 0x8D54A7E7, + 0x47ECA7FE, 0xFF46E223, 0xC6CFC041, 0x7E65859C, + 0x52BEC173, 0xEA1484AE, 0xD39DA6CC, 0x6B37E311, + 0xA18FE308, 0x1925A6D5, 0x20AC84B7, 0x9806C16A, + 0x896CE16C, 0x31C6A4B1, 0x084F86D3, 0xB0E5C30E, + 0x7A5DC317, 0xC2F786CA, 0xFB7EA4A8, 0x43D4E175, + 0x6F0FA59A, 0xD7A5E047, 0xEE2CC225, 0x568687F8, + 0x9C3E87E1, 0x2494C23C, 0x1D1DE05E, 0xA5B7A583, + 0xD89606F9, 0x603C4324, 0x59B56146, 0xE11F249B, + 0x2BA72482, 0x930D615F, 0xAA84433D, 0x122E06E0, + 0x3EF5420F, 0x865F07D2, 0xBFD625B0, 0x077C606D, + 0xCDC46074, 0x756E25A9, 0x4CE707CB, 0xF44D4216, + 0xE5276210, 0x5D8D27CD, 0x640405AF, 0xDCAE4072, + 0x1616406B, 0xAEBC05B6, 0x973527D4, 0x2F9F6209, + 0x034426E6, 0xBBEE633B, 0x82674159, 0x3ACD0484, + 0xF075049D, 0x48DF4140, 0x71566322, 0xC9FC26FF, + 0x5382232E, 0xEB2866F3, 0xD2A14491, 0x6A0B014C, + 0xA0B30155, 0x18194488, 0x219066EA, 0x993A2337, + 0xB5E167D8, 0x0D4B2205, 0x34C20067, 0x8C6845BA, + 0x46D045A3, 0xFE7A007E, 0xC7F3221C, 0x7F5967C1, + 0x6E3347C7, 0xD699021A, 0xEF102078, 0x57BA65A5, + 0x9D0265BC, 0x25A82061, 0x1C210203, 0xA48B47DE, + 0x88500331, 0x30FA46EC, 0x0973648E, 0xB1D92153, + 0x7B61214A, 0xC3CB6497, 0xFA4246F5, 0x42E80328, + }, + { + 0x00000000, 0xAC6F1138, 0x58DF2270, 0xF4B03348, + 0xB0BE45E0, 0x1CD154D8, 0xE8616790, 0x440E76A8, + 0x910B67C5, 0x3D6476FD, 0xC9D445B5, 0x65BB548D, + 0x21B52225, 0x8DDA331D, 0x796A0055, 0xD505116D, + 0xD361228F, 0x7F0E33B7, 0x8BBE00FF, 0x27D111C7, + 0x63DF676F, 0xCFB07657, 0x3B00451F, 0x976F5427, + 0x426A454A, 0xEE055472, 0x1AB5673A, 0xB6DA7602, + 0xF2D400AA, 0x5EBB1192, 0xAA0B22DA, 0x066433E2, + 
0x57B5A81B, 0xFBDAB923, 0x0F6A8A6B, 0xA3059B53, + 0xE70BEDFB, 0x4B64FCC3, 0xBFD4CF8B, 0x13BBDEB3, + 0xC6BECFDE, 0x6AD1DEE6, 0x9E61EDAE, 0x320EFC96, + 0x76008A3E, 0xDA6F9B06, 0x2EDFA84E, 0x82B0B976, + 0x84D48A94, 0x28BB9BAC, 0xDC0BA8E4, 0x7064B9DC, + 0x346ACF74, 0x9805DE4C, 0x6CB5ED04, 0xC0DAFC3C, + 0x15DFED51, 0xB9B0FC69, 0x4D00CF21, 0xE16FDE19, + 0xA561A8B1, 0x090EB989, 0xFDBE8AC1, 0x51D19BF9, + 0xAE6A5137, 0x0205400F, 0xF6B57347, 0x5ADA627F, + 0x1ED414D7, 0xB2BB05EF, 0x460B36A7, 0xEA64279F, + 0x3F6136F2, 0x930E27CA, 0x67BE1482, 0xCBD105BA, + 0x8FDF7312, 0x23B0622A, 0xD7005162, 0x7B6F405A, + 0x7D0B73B8, 0xD1646280, 0x25D451C8, 0x89BB40F0, + 0xCDB53658, 0x61DA2760, 0x956A1428, 0x39050510, + 0xEC00147D, 0x406F0545, 0xB4DF360D, 0x18B02735, + 0x5CBE519D, 0xF0D140A5, 0x046173ED, 0xA80E62D5, + 0xF9DFF92C, 0x55B0E814, 0xA100DB5C, 0x0D6FCA64, + 0x4961BCCC, 0xE50EADF4, 0x11BE9EBC, 0xBDD18F84, + 0x68D49EE9, 0xC4BB8FD1, 0x300BBC99, 0x9C64ADA1, + 0xD86ADB09, 0x7405CA31, 0x80B5F979, 0x2CDAE841, + 0x2ABEDBA3, 0x86D1CA9B, 0x7261F9D3, 0xDE0EE8EB, + 0x9A009E43, 0x366F8F7B, 0xC2DFBC33, 0x6EB0AD0B, + 0xBBB5BC66, 0x17DAAD5E, 0xE36A9E16, 0x4F058F2E, + 0x0B0BF986, 0xA764E8BE, 0x53D4DBF6, 0xFFBBCACE, + 0x5CD5A26E, 0xF0BAB356, 0x040A801E, 0xA8659126, + 0xEC6BE78E, 0x4004F6B6, 0xB4B4C5FE, 0x18DBD4C6, + 0xCDDEC5AB, 0x61B1D493, 0x9501E7DB, 0x396EF6E3, + 0x7D60804B, 0xD10F9173, 0x25BFA23B, 0x89D0B303, + 0x8FB480E1, 0x23DB91D9, 0xD76BA291, 0x7B04B3A9, + 0x3F0AC501, 0x9365D439, 0x67D5E771, 0xCBBAF649, + 0x1EBFE724, 0xB2D0F61C, 0x4660C554, 0xEA0FD46C, + 0xAE01A2C4, 0x026EB3FC, 0xF6DE80B4, 0x5AB1918C, + 0x0B600A75, 0xA70F1B4D, 0x53BF2805, 0xFFD0393D, + 0xBBDE4F95, 0x17B15EAD, 0xE3016DE5, 0x4F6E7CDD, + 0x9A6B6DB0, 0x36047C88, 0xC2B44FC0, 0x6EDB5EF8, + 0x2AD52850, 0x86BA3968, 0x720A0A20, 0xDE651B18, + 0xD80128FA, 0x746E39C2, 0x80DE0A8A, 0x2CB11BB2, + 0x68BF6D1A, 0xC4D07C22, 0x30604F6A, 0x9C0F5E52, + 0x490A4F3F, 0xE5655E07, 0x11D56D4F, 0xBDBA7C77, + 0xF9B40ADF, 0x55DB1BE7, 0xA16B28AF, 0x0D043997, + 0xF2BFF359, 0x5ED0E261, 0xAA60D129, 0x060FC011, + 0x4201B6B9, 0xEE6EA781, 0x1ADE94C9, 0xB6B185F1, + 0x63B4949C, 0xCFDB85A4, 0x3B6BB6EC, 0x9704A7D4, + 0xD30AD17C, 0x7F65C044, 0x8BD5F30C, 0x27BAE234, + 0x21DED1D6, 0x8DB1C0EE, 0x7901F3A6, 0xD56EE29E, + 0x91609436, 0x3D0F850E, 0xC9BFB646, 0x65D0A77E, + 0xB0D5B613, 0x1CBAA72B, 0xE80A9463, 0x4465855B, + 0x006BF3F3, 0xAC04E2CB, 0x58B4D183, 0xF4DBC0BB, + 0xA50A5B42, 0x09654A7A, 0xFDD57932, 0x51BA680A, + 0x15B41EA2, 0xB9DB0F9A, 0x4D6B3CD2, 0xE1042DEA, + 0x34013C87, 0x986E2DBF, 0x6CDE1EF7, 0xC0B10FCF, + 0x84BF7967, 0x28D0685F, 0xDC605B17, 0x700F4A2F, + 0x766B79CD, 0xDA0468F5, 0x2EB45BBD, 0x82DB4A85, + 0xC6D53C2D, 0x6ABA2D15, 0x9E0A1E5D, 0x32650F65, + 0xE7601E08, 0x4B0F0F30, 0xBFBF3C78, 0x13D02D40, + 0x57DE5BE8, 0xFBB14AD0, 0x0F017998, 0xA36E68A0, + }, + { + 0x00000000, 0x196B30EF, 0xC3A08CDB, 0xDACBBC34, + 0x7737F5B2, 0x6E5CC55D, 0xB4977969, 0xADFC4986, + 0x1F180660, 0x0673368F, 0xDCB88ABB, 0xC5D3BA54, + 0x682FF3D2, 0x7144C33D, 0xAB8F7F09, 0xB2E44FE6, + 0x3E300CC0, 0x275B3C2F, 0xFD90801B, 0xE4FBB0F4, + 0x4907F972, 0x506CC99D, 0x8AA775A9, 0x93CC4546, + 0x21280AA0, 0x38433A4F, 0xE288867B, 0xFBE3B694, + 0x561FFF12, 0x4F74CFFD, 0x95BF73C9, 0x8CD44326, + 0x8D16F485, 0x947DC46A, 0x4EB6785E, 0x57DD48B1, + 0xFA210137, 0xE34A31D8, 0x39818DEC, 0x20EABD03, + 0x920EF2E5, 0x8B65C20A, 0x51AE7E3E, 0x48C54ED1, + 0xE5390757, 0xFC5237B8, 0x26998B8C, 0x3FF2BB63, + 0xB326F845, 0xAA4DC8AA, 0x7086749E, 0x69ED4471, + 0xC4110DF7, 0xDD7A3D18, 0x07B1812C, 0x1EDAB1C3, + 0xAC3EFE25, 0xB555CECA, 0x6F9E72FE, 
0x76F54211, + 0xDB090B97, 0xC2623B78, 0x18A9874C, 0x01C2B7A3, + 0xEB5B040E, 0xF23034E1, 0x28FB88D5, 0x3190B83A, + 0x9C6CF1BC, 0x8507C153, 0x5FCC7D67, 0x46A74D88, + 0xF443026E, 0xED283281, 0x37E38EB5, 0x2E88BE5A, + 0x8374F7DC, 0x9A1FC733, 0x40D47B07, 0x59BF4BE8, + 0xD56B08CE, 0xCC003821, 0x16CB8415, 0x0FA0B4FA, + 0xA25CFD7C, 0xBB37CD93, 0x61FC71A7, 0x78974148, + 0xCA730EAE, 0xD3183E41, 0x09D38275, 0x10B8B29A, + 0xBD44FB1C, 0xA42FCBF3, 0x7EE477C7, 0x678F4728, + 0x664DF08B, 0x7F26C064, 0xA5ED7C50, 0xBC864CBF, + 0x117A0539, 0x081135D6, 0xD2DA89E2, 0xCBB1B90D, + 0x7955F6EB, 0x603EC604, 0xBAF57A30, 0xA39E4ADF, + 0x0E620359, 0x170933B6, 0xCDC28F82, 0xD4A9BF6D, + 0x587DFC4B, 0x4116CCA4, 0x9BDD7090, 0x82B6407F, + 0x2F4A09F9, 0x36213916, 0xECEA8522, 0xF581B5CD, + 0x4765FA2B, 0x5E0ECAC4, 0x84C576F0, 0x9DAE461F, + 0x30520F99, 0x29393F76, 0xF3F28342, 0xEA99B3AD, + 0xD6B7081C, 0xCFDC38F3, 0x151784C7, 0x0C7CB428, + 0xA180FDAE, 0xB8EBCD41, 0x62207175, 0x7B4B419A, + 0xC9AF0E7C, 0xD0C43E93, 0x0A0F82A7, 0x1364B248, + 0xBE98FBCE, 0xA7F3CB21, 0x7D387715, 0x645347FA, + 0xE88704DC, 0xF1EC3433, 0x2B278807, 0x324CB8E8, + 0x9FB0F16E, 0x86DBC181, 0x5C107DB5, 0x457B4D5A, + 0xF79F02BC, 0xEEF43253, 0x343F8E67, 0x2D54BE88, + 0x80A8F70E, 0x99C3C7E1, 0x43087BD5, 0x5A634B3A, + 0x5BA1FC99, 0x42CACC76, 0x98017042, 0x816A40AD, + 0x2C96092B, 0x35FD39C4, 0xEF3685F0, 0xF65DB51F, + 0x44B9FAF9, 0x5DD2CA16, 0x87197622, 0x9E7246CD, + 0x338E0F4B, 0x2AE53FA4, 0xF02E8390, 0xE945B37F, + 0x6591F059, 0x7CFAC0B6, 0xA6317C82, 0xBF5A4C6D, + 0x12A605EB, 0x0BCD3504, 0xD1068930, 0xC86DB9DF, + 0x7A89F639, 0x63E2C6D6, 0xB9297AE2, 0xA0424A0D, + 0x0DBE038B, 0x14D53364, 0xCE1E8F50, 0xD775BFBF, + 0x3DEC0C12, 0x24873CFD, 0xFE4C80C9, 0xE727B026, + 0x4ADBF9A0, 0x53B0C94F, 0x897B757B, 0x90104594, + 0x22F40A72, 0x3B9F3A9D, 0xE15486A9, 0xF83FB646, + 0x55C3FFC0, 0x4CA8CF2F, 0x9663731B, 0x8F0843F4, + 0x03DC00D2, 0x1AB7303D, 0xC07C8C09, 0xD917BCE6, + 0x74EBF560, 0x6D80C58F, 0xB74B79BB, 0xAE204954, + 0x1CC406B2, 0x05AF365D, 0xDF648A69, 0xC60FBA86, + 0x6BF3F300, 0x7298C3EF, 0xA8537FDB, 0xB1384F34, + 0xB0FAF897, 0xA991C878, 0x735A744C, 0x6A3144A3, + 0xC7CD0D25, 0xDEA63DCA, 0x046D81FE, 0x1D06B111, + 0xAFE2FEF7, 0xB689CE18, 0x6C42722C, 0x752942C3, + 0xD8D50B45, 0xC1BE3BAA, 0x1B75879E, 0x021EB771, + 0x8ECAF457, 0x97A1C4B8, 0x4D6A788C, 0x54014863, + 0xF9FD01E5, 0xE096310A, 0x3A5D8D3E, 0x2336BDD1, + 0x91D2F237, 0x88B9C2D8, 0x52727EEC, 0x4B194E03, + 0xE6E50785, 0xFF8E376A, 0x25458B5E, 0x3C2EBBB1, + }, + { + 0x00000000, 0xC82C0368, 0x905906D0, 0x587505B8, + 0xD1C5E0A5, 0x19E9E3CD, 0x419CE675, 0x89B0E51D, + 0x53FD2D4E, 0x9BD12E26, 0xC3A42B9E, 0x0B8828F6, + 0x8238CDEB, 0x4A14CE83, 0x1261CB3B, 0xDA4DC853, + 0xA6FA5B9C, 0x6ED658F4, 0x36A35D4C, 0xFE8F5E24, + 0x773FBB39, 0xBF13B851, 0xE766BDE9, 0x2F4ABE81, + 0xF50776D2, 0x3D2B75BA, 0x655E7002, 0xAD72736A, + 0x24C29677, 0xECEE951F, 0xB49B90A7, 0x7CB793CF, + 0xBD835B3D, 0x75AF5855, 0x2DDA5DED, 0xE5F65E85, + 0x6C46BB98, 0xA46AB8F0, 0xFC1FBD48, 0x3433BE20, + 0xEE7E7673, 0x2652751B, 0x7E2770A3, 0xB60B73CB, + 0x3FBB96D6, 0xF79795BE, 0xAFE29006, 0x67CE936E, + 0x1B7900A1, 0xD35503C9, 0x8B200671, 0x430C0519, + 0xCABCE004, 0x0290E36C, 0x5AE5E6D4, 0x92C9E5BC, + 0x48842DEF, 0x80A82E87, 0xD8DD2B3F, 0x10F12857, + 0x9941CD4A, 0x516DCE22, 0x0918CB9A, 0xC134C8F2, + 0x7A07B77A, 0xB22BB412, 0xEA5EB1AA, 0x2272B2C2, + 0xABC257DF, 0x63EE54B7, 0x3B9B510F, 0xF3B75267, + 0x29FA9A34, 0xE1D6995C, 0xB9A39CE4, 0x718F9F8C, + 0xF83F7A91, 0x301379F9, 0x68667C41, 0xA04A7F29, + 0xDCFDECE6, 0x14D1EF8E, 0x4CA4EA36, 0x8488E95E, + 0x0D380C43, 0xC5140F2B, 
0x9D610A93, 0x554D09FB, + 0x8F00C1A8, 0x472CC2C0, 0x1F59C778, 0xD775C410, + 0x5EC5210D, 0x96E92265, 0xCE9C27DD, 0x06B024B5, + 0xC784EC47, 0x0FA8EF2F, 0x57DDEA97, 0x9FF1E9FF, + 0x16410CE2, 0xDE6D0F8A, 0x86180A32, 0x4E34095A, + 0x9479C109, 0x5C55C261, 0x0420C7D9, 0xCC0CC4B1, + 0x45BC21AC, 0x8D9022C4, 0xD5E5277C, 0x1DC92414, + 0x617EB7DB, 0xA952B4B3, 0xF127B10B, 0x390BB263, + 0xB0BB577E, 0x78975416, 0x20E251AE, 0xE8CE52C6, + 0x32839A95, 0xFAAF99FD, 0xA2DA9C45, 0x6AF69F2D, + 0xE3467A30, 0x2B6A7958, 0x731F7CE0, 0xBB337F88, + 0xF40E6EF5, 0x3C226D9D, 0x64576825, 0xAC7B6B4D, + 0x25CB8E50, 0xEDE78D38, 0xB5928880, 0x7DBE8BE8, + 0xA7F343BB, 0x6FDF40D3, 0x37AA456B, 0xFF864603, + 0x7636A31E, 0xBE1AA076, 0xE66FA5CE, 0x2E43A6A6, + 0x52F43569, 0x9AD83601, 0xC2AD33B9, 0x0A8130D1, + 0x8331D5CC, 0x4B1DD6A4, 0x1368D31C, 0xDB44D074, + 0x01091827, 0xC9251B4F, 0x91501EF7, 0x597C1D9F, + 0xD0CCF882, 0x18E0FBEA, 0x4095FE52, 0x88B9FD3A, + 0x498D35C8, 0x81A136A0, 0xD9D43318, 0x11F83070, + 0x9848D56D, 0x5064D605, 0x0811D3BD, 0xC03DD0D5, + 0x1A701886, 0xD25C1BEE, 0x8A291E56, 0x42051D3E, + 0xCBB5F823, 0x0399FB4B, 0x5BECFEF3, 0x93C0FD9B, + 0xEF776E54, 0x275B6D3C, 0x7F2E6884, 0xB7026BEC, + 0x3EB28EF1, 0xF69E8D99, 0xAEEB8821, 0x66C78B49, + 0xBC8A431A, 0x74A64072, 0x2CD345CA, 0xE4FF46A2, + 0x6D4FA3BF, 0xA563A0D7, 0xFD16A56F, 0x353AA607, + 0x8E09D98F, 0x4625DAE7, 0x1E50DF5F, 0xD67CDC37, + 0x5FCC392A, 0x97E03A42, 0xCF953FFA, 0x07B93C92, + 0xDDF4F4C1, 0x15D8F7A9, 0x4DADF211, 0x8581F179, + 0x0C311464, 0xC41D170C, 0x9C6812B4, 0x544411DC, + 0x28F38213, 0xE0DF817B, 0xB8AA84C3, 0x708687AB, + 0xF93662B6, 0x311A61DE, 0x696F6466, 0xA143670E, + 0x7B0EAF5D, 0xB322AC35, 0xEB57A98D, 0x237BAAE5, + 0xAACB4FF8, 0x62E74C90, 0x3A924928, 0xF2BE4A40, + 0x338A82B2, 0xFBA681DA, 0xA3D38462, 0x6BFF870A, + 0xE24F6217, 0x2A63617F, 0x721664C7, 0xBA3A67AF, + 0x6077AFFC, 0xA85BAC94, 0xF02EA92C, 0x3802AA44, + 0xB1B24F59, 0x799E4C31, 0x21EB4989, 0xE9C74AE1, + 0x9570D92E, 0x5D5CDA46, 0x0529DFFE, 0xCD05DC96, + 0x44B5398B, 0x8C993AE3, 0xD4EC3F5B, 0x1CC03C33, + 0xC68DF460, 0x0EA1F708, 0x56D4F2B0, 0x9EF8F1D8, + 0x174814C5, 0xDF6417AD, 0x87111215, 0x4F3D117D, + }, + { + 0x00000000, 0x277D3C49, 0x4EFA7892, 0x698744DB, + 0x6D821D21, 0x4AFF2168, 0x237865B3, 0x040559FA, + 0xDA043B42, 0xFD79070B, 0x94FE43D0, 0xB3837F99, + 0xB7862663, 0x90FB1A2A, 0xF97C5EF1, 0xDE0162B8, + 0xB4097684, 0x93744ACD, 0xFAF30E16, 0xDD8E325F, + 0xD98B6BA5, 0xFEF657EC, 0x97711337, 0xB00C2F7E, + 0x6E0D4DC6, 0x4970718F, 0x20F73554, 0x078A091D, + 0x038F50E7, 0x24F26CAE, 0x4D752875, 0x6A08143C, + 0x9965000D, 0xBE183C44, 0xD79F789F, 0xF0E244D6, + 0xF4E71D2C, 0xD39A2165, 0xBA1D65BE, 0x9D6059F7, + 0x43613B4F, 0x641C0706, 0x0D9B43DD, 0x2AE67F94, + 0x2EE3266E, 0x099E1A27, 0x60195EFC, 0x476462B5, + 0x2D6C7689, 0x0A114AC0, 0x63960E1B, 0x44EB3252, + 0x40EE6BA8, 0x679357E1, 0x0E14133A, 0x29692F73, + 0xF7684DCB, 0xD0157182, 0xB9923559, 0x9EEF0910, + 0x9AEA50EA, 0xBD976CA3, 0xD4102878, 0xF36D1431, + 0x32CB001A, 0x15B63C53, 0x7C317888, 0x5B4C44C1, + 0x5F491D3B, 0x78342172, 0x11B365A9, 0x36CE59E0, + 0xE8CF3B58, 0xCFB20711, 0xA63543CA, 0x81487F83, + 0x854D2679, 0xA2301A30, 0xCBB75EEB, 0xECCA62A2, + 0x86C2769E, 0xA1BF4AD7, 0xC8380E0C, 0xEF453245, + 0xEB406BBF, 0xCC3D57F6, 0xA5BA132D, 0x82C72F64, + 0x5CC64DDC, 0x7BBB7195, 0x123C354E, 0x35410907, + 0x314450FD, 0x16396CB4, 0x7FBE286F, 0x58C31426, + 0xABAE0017, 0x8CD33C5E, 0xE5547885, 0xC22944CC, + 0xC62C1D36, 0xE151217F, 0x88D665A4, 0xAFAB59ED, + 0x71AA3B55, 0x56D7071C, 0x3F5043C7, 0x182D7F8E, + 0x1C282674, 0x3B551A3D, 0x52D25EE6, 0x75AF62AF, + 0x1FA77693, 
0x38DA4ADA, 0x515D0E01, 0x76203248, + 0x72256BB2, 0x555857FB, 0x3CDF1320, 0x1BA22F69, + 0xC5A34DD1, 0xE2DE7198, 0x8B593543, 0xAC24090A, + 0xA82150F0, 0x8F5C6CB9, 0xE6DB2862, 0xC1A6142B, + 0x64960134, 0x43EB3D7D, 0x2A6C79A6, 0x0D1145EF, + 0x09141C15, 0x2E69205C, 0x47EE6487, 0x609358CE, + 0xBE923A76, 0x99EF063F, 0xF06842E4, 0xD7157EAD, + 0xD3102757, 0xF46D1B1E, 0x9DEA5FC5, 0xBA97638C, + 0xD09F77B0, 0xF7E24BF9, 0x9E650F22, 0xB918336B, + 0xBD1D6A91, 0x9A6056D8, 0xF3E71203, 0xD49A2E4A, + 0x0A9B4CF2, 0x2DE670BB, 0x44613460, 0x631C0829, + 0x671951D3, 0x40646D9A, 0x29E32941, 0x0E9E1508, + 0xFDF30139, 0xDA8E3D70, 0xB30979AB, 0x947445E2, + 0x90711C18, 0xB70C2051, 0xDE8B648A, 0xF9F658C3, + 0x27F73A7B, 0x008A0632, 0x690D42E9, 0x4E707EA0, + 0x4A75275A, 0x6D081B13, 0x048F5FC8, 0x23F26381, + 0x49FA77BD, 0x6E874BF4, 0x07000F2F, 0x207D3366, + 0x24786A9C, 0x030556D5, 0x6A82120E, 0x4DFF2E47, + 0x93FE4CFF, 0xB48370B6, 0xDD04346D, 0xFA790824, + 0xFE7C51DE, 0xD9016D97, 0xB086294C, 0x97FB1505, + 0x565D012E, 0x71203D67, 0x18A779BC, 0x3FDA45F5, + 0x3BDF1C0F, 0x1CA22046, 0x7525649D, 0x525858D4, + 0x8C593A6C, 0xAB240625, 0xC2A342FE, 0xE5DE7EB7, + 0xE1DB274D, 0xC6A61B04, 0xAF215FDF, 0x885C6396, + 0xE25477AA, 0xC5294BE3, 0xACAE0F38, 0x8BD33371, + 0x8FD66A8B, 0xA8AB56C2, 0xC12C1219, 0xE6512E50, + 0x38504CE8, 0x1F2D70A1, 0x76AA347A, 0x51D70833, + 0x55D251C9, 0x72AF6D80, 0x1B28295B, 0x3C551512, + 0xCF380123, 0xE8453D6A, 0x81C279B1, 0xA6BF45F8, + 0xA2BA1C02, 0x85C7204B, 0xEC406490, 0xCB3D58D9, + 0x153C3A61, 0x32410628, 0x5BC642F3, 0x7CBB7EBA, + 0x78BE2740, 0x5FC31B09, 0x36445FD2, 0x1139639B, + 0x7B3177A7, 0x5C4C4BEE, 0x35CB0F35, 0x12B6337C, + 0x16B36A86, 0x31CE56CF, 0x58491214, 0x7F342E5D, + 0xA1354CE5, 0x864870AC, 0xEFCF3477, 0xC8B2083E, + 0xCCB751C4, 0xEBCA6D8D, 0x824D2956, 0xA530151F + } +#endif /* WORDS_BIGENDIAN */ +}; + +/* Accumulate one input byte */ +#ifdef WORDS_BIGENDIAN +#define CRC8(x) crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8) +#else +#define CRC8(x) crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8) +#endif + +uint32 +comp_crc32c_sb8(uint32 crc, const void *data, size_t len) +{ + const unsigned char *p = (const unsigned char *)data; + const uint32 *p4; + + /* + * Handle 0-3 initial bytes one at a time, so that the loop below starts + * with a pointer aligned to four bytes. + */ + while (len > 0 && ((uintptr_t) p & 3)) + { + crc = CRC8(*p++); + len--; + } + + /* + * Process eight bytes of data at a time. + */ + p4 = (const uint32 *) p; + while (len >= 8) + { + uint32 a = *p4++ ^ crc; + uint32 b = *p4++; + +#ifdef WORDS_BIGENDIAN + const uint8 c0 = b; + const uint8 c1 = b >> 8; + const uint8 c2 = b >> 16; + const uint8 c3 = b >> 24; + const uint8 c4 = a; + const uint8 c5 = a >> 8; + const uint8 c6 = a >> 16; + const uint8 c7 = a >> 24; +#else + const uint8 c0 = b >> 24; + const uint8 c1 = b >> 16; + const uint8 c2 = b >> 8; + const uint8 c3 = b; + const uint8 c4 = a >> 24; + const uint8 c5 = a >> 16; + const uint8 c6 = a >> 8; + const uint8 c7 = a; +#endif + + crc = + crc32c_table[0][c0] ^ crc32c_table[1][c1] ^ + crc32c_table[2][c2] ^ crc32c_table[3][c3] ^ + crc32c_table[4][c4] ^ crc32c_table[5][c5] ^ + crc32c_table[6][c6] ^ crc32c_table[7][c7]; + + len -= 8; + } + + /* + * Handle any remaining bytes one at a time. 
+ */ + p = (const unsigned char *) p4; + while (len > 0) + { + crc = CRC8(*p++); + len--; + } + + return crc; +} + +/* + * error handle + */ +#ifdef __cplusplus +extern "C" { +#endif + +static thread_local ICError ic_error = {LEVEL_ERROR}; + +ICError* +GetLastError() +{ + return &ic_error; +} + +void +ResetLastError() +{ + ic_error.level = LEVEL_OK; +} + +void +SetLastError(ErrorLevel level, const char *msg) +{ + ic_error.level = level; + strncpy(ic_error.msg, msg, sizeof(ic_error.msg) - 1); + ic_error.msg[sizeof(ic_error.msg) - 1] = 0; +} + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file diff --git a/contrib/udp2/ic_common/ic_utility.hpp b/contrib/udp2/ic_common/ic_utility.hpp new file mode 100644 index 00000000000..b8c8919ff12 --- /dev/null +++ b/contrib/udp2/ic_common/ic_utility.hpp @@ -0,0 +1,167 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * ic_utility.hpp + * + * IDENTIFICATION + * contrib/udp2/ic_common/ic_utility.hpp + * + *------------------------------------------------------------------------- + */ +#ifndef IC_UTILITY_HPP +#define IC_UTILITY_HPP + +#include +#include +#include +#include + +#include +#include +#include + +/* Define this if you want tons of logs! */ +#undef AMS_VERBOSE_LOGGING + +#ifndef NDEBUG +#define USE_ASSERT_CHECKING 1 +#endif + +#define Assert(p) assert(p) +#define closesocket close + +#define IC_DEBUG5 10 +#define IC_DEBUG4 11 +#define IC_DEBUG3 12 +#define IC_DEBUG2 13 +#define IC_DEBUG1 14 +#define IC_LOG 15 +#define IC_WARNING 19 + +/* tupchunk.h */ +#define ANY_ROUTE -100 + +/* cdbvars.h */ +#define DEFAULT_PACKET_SIZE 8192 +#define MIN_PACKET_SIZE 512 +#define MAX_PACKET_SIZE 65507 /* Max payload for IPv4/UDP (subtract 20 more for IPv6 without extensions) */ +#define UNDEF_SEGMENT -2 + +/* c.h */ +#define Max(x, y) ((x) > (y) ? (x) : (y)) +#define Min(x, y) ((x) < (y) ? 
(x) : (y)) + +/* cdbinterconnect.h */ +#define CTS_INITIAL_SIZE (10) + +/* pg_list.h */ +#define NIL ((List *) NULL) + +/* transam.h */ +#define InvalidTransactionId (0) + +#define INVALID_SOCKET (-1) + +/* + * CONTAINER_OF + */ +#define CONTAINER_OF(ptr, type, member) \ + ( \ + reinterpret_cast(reinterpret_cast(ptr) - offsetof(type, member)) \ + ) + + +typedef enum GpVars_Verbosity_IC +{ + GPVARS_VERBOSITY_UNDEFINED_IC = 0, + GPVARS_VERBOSITY_OFF_IC, + GPVARS_VERBOSITY_TERSE_IC, + GPVARS_VERBOSITY_VERBOSE_IC, + GPVARS_VERBOSITY_DEBUG_IC, +} GpVars_Verbosity_IC; + +typedef enum GpVars_Interconnect_Method_IC +{ + INTERCONNECT_FC_METHOD_CAPACITY_IC = 0, + INTERCONNECT_FC_METHOD_LOSS_IC = 2, + INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC = 3, + INTERCONNECT_FC_METHOD_LOSS_TIMER_IC = 4, +} GpVars_Interconnect_Method_IC; + +typedef enum +{ + GP_ROLE_UNDEFINED_IC = 0, /* Should never see this role in use */ + GP_ROLE_UTILITY_IC, /* Operating as a simple database engine */ + GP_ROLE_DISPATCH_IC, /* Operating as the parallel query dispatcher */ + GP_ROLE_EXECUTE_IC, /* Operating as a parallel query executor */ +} GpRoleValue_IC; + +typedef enum GpVars_Interconnect_Address_Type_IC +{ + INTERCONNECT_ADDRESS_TYPE_UNICAST_IC = 0, + INTERCONNECT_ADDRESS_TYPE_WILDCARD_IC +} GpVars_Interconnect_Address_Type_IC; + + +/* + * global_param and session_param; + */ +extern GlobalMotionLayerIPCParam global_param; +extern SessionMotionLayerIPCParam session_param; + +/* + * logger stuff + */ +#define DEFAULT_LOG_LEVEL INFO +extern const char * SeverityName[9]; + +enum LogSeverity { + FATAL, LOG_ERROR, WARNING, INFO, DEBUG1, DEBUG2, DEBUG3, DEBUG4, DEBUG5 +}; + +class Logger; + +class Logger { +public: + Logger(); + + ~Logger(); + + void setOutputFd(int f); + + void setLogSeverity(LogSeverity l); + + void printf(LogSeverity s, const char * fmt, ...) __attribute__((format(printf, 3, 4))); + +private: + int fd; + std::atomic severity; +}; + +extern Logger RootLogger; + +#define LOG(s, fmt, ...) \ + RootLogger.printf(s, fmt, ##__VA_ARGS__) + +/* + * crc32 + */ +extern uint32 ComputeCRC(const void *data, size_t len); + +#endif // IC_UTILITY_HPP \ No newline at end of file diff --git a/contrib/udp2/ic_common/udp2/ic_udp2.cpp b/contrib/udp2/ic_common/udp2/ic_udp2.cpp new file mode 100644 index 00000000000..a747d91f600 --- /dev/null +++ b/contrib/udp2/ic_common/udp2/ic_udp2.cpp @@ -0,0 +1,7649 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * ic_udp2.cpp + * + * IDENTIFICATION + * contrib/udp2/ic_common/udp2/ic_udp2.cpp + * + * +--------------+ + * | ic_types.h | + * +--------------+ + * / \ + * +--------------+ +---------------+ + * | C interface | | C++ interface | + * | ic_udp2.h | | ic_udp2.hpp | + * +--------------+ +---------------+ + * \ / + * +----------------------+ + * | C++ implement | + * | ic_udp2_internal.hpp| + * | ic_faultinjection.h | + * | ic_udp2.cpp | + * +----------------------+ + *------------------------------------------------------------------------- + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * interface header files + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "ic_types.h" +#include "ic_udp2.h" + +#ifdef __cplusplus +} +#endif + +#include "ic_udp2.hpp" + +/* + * internal header files + */ +#include "ic_utility.hpp" +#include "ic_udp2_internal.hpp" +#include "ic_faultinjection.h" + +/* + * Hints to the compiler about the likelihood of a branch. Both likely() and + * unlikely() return the boolean value of the contained expression. + * + * These should only be used sparingly, in very hot code paths. It's very easy + * to mis-estimate likelihoods. + */ +#if __GNUC__ >= 3 +#define likely(x) __builtin_expect((x) != 0, 1) +#define unlikely(x) __builtin_expect((x) != 0, 0) +#else +#define likely(x) ((x) != 0) +#define unlikely(x) ((x) != 0) +#endif + +static int timeoutArray[] = +{ + 1, + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 512 /* MAX_TRY */ +}; + +/* + * Main thread (Receiver) and background thread use the information in + * this data structure to handle data packets. + */ +static ReceiveControlInfo rx_control_info; + +/* + * The buffer pool used for keeping data packets. + * + * maxCount is set to 1 to make sure there is always a buffer + * for picking packets from OS buffer. + */ +static RxBufferPool rx_buffer_pool = {1, 0, NULL}; + +/* + * The sender side buffer pool. + */ +static SendBufferPool snd_buffer_pool; + +/* + * Main thread use the information in this data structure to do ack handling + * and congestion control. + */ +static SendControlInfo snd_control_info; + +/* + * Shared control information that is used by senders, receivers and background thread. + */ +static ICGlobalControlInfo ic_control_info; + +/* + * All connections in a process share this unack queue ring instance. + */ +static UnackQueueRing unack_queue_ring = {0}; + +static int ICSenderSocket = -1; +static int32 ICSenderPort = 0; +static int ICSenderFamily = 0; + +/* Statistics for UDP interconnect. */ +static ICStatistics ic_statistics; + +/* Cached sockaddr of the listening udp socket */ +static struct sockaddr_storage udp_dummy_packet_sockaddr; + +/* UDP listen fd */ +static int UDP_listenerFd = -1; + +/* UDP listen port */ +static int32 udp_listener_port = 0; + +static std::mutex mtx; +static std::condition_variable cv; + +CChunkTransportState *CChunkTransportStateImpl::state_ = nullptr; + +static struct mudp_manager mudp; + +/* + * Identity the user of ic module by vector_engine_is_user: + * "false" means PG executor, "true" means Arrow executor. 
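+ *
+ * The flag is thread_local, so each worker thread records independently
+ * which executor drives it; the CHECK_* macros below consult it to choose
+ * between the in-module thread-quit flag and the callback-based
+ * interrupt/cancel checks supplied through global_param.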
+ */ +static thread_local bool vector_engine_is_user = false; +static thread_local bool thread_quit = false; + +#define CHECK_QUIT_FLAG() \ + do { \ + if (thread_quit) { \ + throw ICException("received thread quit flag.", __FILE__, __LINE__); \ + } \ + } while(0) + +#define CHECK_INTERRUPTS(state) \ + do { \ + if (vector_engine_is_user) { \ + CHECK_QUIT_FLAG(); \ + } else if (global_param.checkInterruptsCallback) { \ + global_param.checkInterruptsCallback((state)->teardownActive); \ + } \ + } while(0) + +#define CHECK_CANCEL(state) \ + do { \ + if (vector_engine_is_user) { \ + CHECK_QUIT_FLAG(); \ + } else if (global_param.checkCancelOnQDCallback) { \ + global_param.checkCancelOnQDCallback(state); \ + } \ + } while(0) + +#define CHECK_POSTMASTER_ALIVE() \ + do { \ + if (vector_engine_is_user) { \ + CHECK_QUIT_FLAG(); \ + } else if (global_param.checkPostmasterIsAliveCallback && !global_param.checkPostmasterIsAliveCallback()) { \ + throw ICFatalException("FATAL, interconnect failed to send chunks, Postmaster is not alive.", __FILE__, __LINE__); \ + } \ + } while(0) + +/*========================================================================= + * STATIC FUNCTIONS declarations + */ + +/* Background thread error handling functions. */ +static void checkRxThreadError(void); +static void setRxThreadError(int eno); +static void resetRxThreadError(void); + +static uint32 setUDPSocketBufferSize(int ic_socket, int buffer_type); +static void setupOutgoingUDPConnection(int icid, TransportEntry *pChunkEntry, UDPConn *conn); + +/* ICBufferList functions. */ +static inline void icBufferListInitHeadLink(ICBufferLink *link); + +static inline void InitMotionUDPIFC(int *listenerSocketFd, int32 *listenerPort); +static inline void CleanupMotionUDPIFC(void); + +static bool dispatcherAYT(void); +static void checkQDConnectionAlive(void); + +static void *rxThreadFunc(void *arg); + +static void putIntoUnackQueueRing(UnackQueueRing *uqr, ICBuffer *buf, uint64 expTime, uint64 now); + +static bool cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len); +static void cleanupStartupCache(void); +static void handleCachedPackets(void); + +static uint64 getCurrentTime(void); +static void initMutex(pthread_mutex_t *mutex); + +static inline void logPkt(const char *prefix, icpkthdr *pkt); + +static void ConvertToIPv4MappedAddr(struct sockaddr_storage *sockaddr, socklen_t *o_len); +#if defined(__darwin__) +#define s6_addr32 __u6_addr.__u6_addr32 +static void ConvertIPv6WildcardToLoopback(struct sockaddr_storage* dest); +#endif + +static void setupUDPListeningSocket(int *listenerSocketFd, int32 *listenerPort, + int *txFamily, struct sockaddr_storage *listenerSockaddr); +static void getSockAddr(struct sockaddr_storage *peer, socklen_t *peer_len, const char *listenerAddr, int listenerPort); +static void SendDummyPacket(void); +static bool handleDataPacket(UDPConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, socklen_t *peerlen, AckSendParam *param, bool *wakeup_mainthread); +static bool handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len); +static void initUnackQueueRing(UnackQueueRing *uqr); + +static ssize_t sendtoWithRetry(int socket, const void *message, size_t length, int flags, const struct sockaddr *dest_addr, socklen_t dest_len, int retry, const char *errDetail); + +static char *format_sockaddr_udp(struct sockaddr_storage *sa, char *buf, size_t len); + +static void initUdpManager(mudp_manager_t mptr); + +static char* flags2txt(uint32 pkt_flags); + +static const char* 
flags_text[] = + {"recv2send", "ack", "stop", "eos", "nak", "disorder", "duplicate", "capacity"}; + +static char* +flags2txt(uint32 pkt_flags) +{ + thread_local static char flags[64]; + + char *p = flags; + *p = '\0'; + int bytes = 0; + for (size_t i = 0; i < sizeof(flags_text)/sizeof(flags_text[0]); ++i) { + if (pkt_flags & (1 << i)) + bytes += snprintf(p + bytes, 64, "%s | ", flags_text[i]); + } + + if (bytes > 0) + bytes -= 3; + *(p + bytes) = '\0'; + + return flags; +} + +/* + * CursorICHistoryTable::prune + * Prune entries in the hash table. + */ +void +CursorICHistoryTable::prune(uint32 icId) { + for (uint32 index = 0; index < size; index++) { + CursorICHistoryEntry *p = table[index], *q = NULL; + while (p) { + /* remove an entry if it is older than the prune-point */ + if (p->icId < icId) { + if (!q) + table[index] = p->next; + else + q->next = p->next; + + /* set up next loop */ + CursorICHistoryEntry *trash = p; + p = trash->next; + ic_free(trash); + + count--; + } else { + q = p; + p = p->next; + } + } + } +} + +#ifdef TRANSFER_PROTOCOL_STATS +typedef enum TransProtoEvent +{ + TPE_DATA_PKT_SEND, + TPE_ACK_PKT_QUERY +} TransProtoEvent; + +typedef struct TransProtoStatEntry TransProtoStatEntry; +struct TransProtoStatEntry +{ + TransProtoStatEntry *next; + + /* Basic information */ + uint32 time; + TransProtoEvent event; + int dstPid; + uint32 seq; + + /* more attributes can be added on demand. */ + + /* + * float cwnd; + * int capacity; + */ +}; + +typedef struct TransProtoStats TransProtoStats; +struct TransProtoStats +{ + std::mutex lock; + TransProtoStatEntry *head; + TransProtoStatEntry *tail; + uint64 count; + uint64 startTime; + + void init(); + void update(TransProtoEvent event, icpkthdr *pkt); + void dump(); +}; + +static TransProtoStats trans_proto_stats = +{ + {}, NULL, NULL, 0 +}; + +/* + * init + * Initialize the transport protocol states data structures. 
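`flags2txt()` above turns a packet-flag bitmask into a readable `" | "`-separated string for logging. A simplified standalone sketch of the same decoding follows; it reuses the flag names from `flags_text[]` but builds a `std::string` for clarity, whereas the real function keeps a `thread_local` fixed buffer, presumably to stay allocation-free on logging paths.

```cpp
// Decode a bitmask into "name | name | ..." using a position-indexed table.
#include <cstddef>
#include <cstdio>
#include <string>

static const char *kFlagNames[] =
    {"recv2send", "ack", "stop", "eos", "nak", "disorder", "duplicate", "capacity"};

static std::string FlagsToText(unsigned flags)
{
    std::string out;
    for (size_t i = 0; i < sizeof(kFlagNames) / sizeof(kFlagNames[0]); ++i)
    {
        if (flags & (1u << i))
        {
            if (!out.empty())
                out += " | ";        /* separator only between names */
            out += kFlagNames[i];
        }
    }
    return out;
}

int main()
{
    /* bit 1 (ack) and bit 7 (capacity) set */
    std::printf("%s\n", FlagsToText((1u << 1) | (1u << 7)).c_str());  /* "ack | capacity" */
    return 0;
}
```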
+ */ +void +TransProtoStats::init() +{ + std::lock_guard guard(this->lock); + + while (this->head) { + TransProtoStatEntry *cur = this->head; + this->head = this->head->next; + ic_free(cur); + this->count--; + } + + this->head = NULL; + this->tail = NULL; + this->count = 0; + this->startTime = getCurrentTime(); +} + +void +TransProtoStats::update(TransProtoEvent event, icpkthdr *pkt) +{ + /* Add to list */ + TransProtoStatEntry *entry = (TransProtoStatEntry *) ic_malloc(sizeof(TransProtoStatEntry)); + if (!entry) + return; + + memset(entry, 0, sizeof(*entry)); + + /* change the list */ + std::lock_guard guard(this->lock); + if (this->count == 0) { + /* 1st element */ + this->head = entry; + this->tail = entry; + } else { + this->tail->next = entry; + this->tail = entry; + } + this->count++; + + entry->time = getCurrentTime() - this->startTime; + entry->event = event; + entry->dstPid = pkt->dstPid; + entry->seq = pkt->seq; + + /* + * Other attributes can be added on demand new->cwnd = + * snd_control_info.cwnd; new->capacity = conn->capacity; + */ +} + +void +TransProtoStats::dump() +{ + char tmpbuf[32]; + + snprintf(tmpbuf, 32, "%d.%lu.txt", global_param.MyProcPid, getCurrentTime()); + FILE *ofile = fopen(tmpbuf, "w+"); + + std::lock_guard guard(this->lock); + while (this->head) + { + TransProtoStatEntry *cur = NULL; + + cur = this->head; + this->head = this->head->next; + + fprintf(ofile, "time %d event %d seq %d destpid %d\n", cur->time, cur->event, cur->seq, cur->dstPid); + ic_free(cur); + this->count--; + } + + this->tail = NULL; + fclose(ofile); +} + +#endif /* TRANSFER_PROTOCOL_STATS */ + +/* + * initConnHashTable + * Initialize a connection hash table. + */ +bool +ConnHashTable::init() +{ + this->size = global_param.Gp_role == GP_ROLE_DISPATCH_IC ? + (global_param.segment_number * 2) : global_param.ic_htab_size; + Assert(this->size > 0); + + this->table = (struct ConnHtabBin **) ic_malloc(this->size * sizeof(struct ConnHtabBin *)); + if (this->table == NULL) + return false; + + for (int i = 0; i < this->size; i++) + this->table[i] = NULL; + + return true; +} + +/* + * connAddHash + * Add a connection to the hash table + * + * Note: we want to add a connection to the hashtable if it isn't + * already there ... so we just have to check the pointer values -- no + * need to use CONN_HASH_MATCH() at all! + */ +bool +ConnHashTable::add(UDPConn *conn) +{ + uint32 hashcode = CONN_HASH_VALUE(&conn->conn_info) % this->size; + + /* + * check for collision -- if we already have an entry for this connection, + * don't add another one. + */ + for (struct ConnHtabBin *bin = this->table[hashcode]; bin != NULL; bin = bin->next) + { + if (bin->conn == conn) + { + LOG(DEBUG5, "ConnHashTable::add(): duplicate ?! node %d route %d", conn->conn_info.motNodeId, conn->route); + return true; /* false *only* indicates memory-alloc + * failure. */ + } + } + + struct ConnHtabBin *newbin = (struct ConnHtabBin *) ic_malloc(sizeof(struct ConnHtabBin)); + if (newbin == NULL) + return false; + + newbin->conn = conn; + newbin->next = this->table[hashcode]; + this->table[hashcode] = newbin; + + ic_statistics.activeConnectionsNum++; + + return true; +} + +/* + * remove + * Delete a connection from the hash table + * + * Note: we want to remove a connection from the hashtable if it is + * there ... so we just have to check the pointer values -- no need to + * use CONN_HASH_MATCH() at all! 
+ */ +void +ConnHashTable::remove(UDPConn *conn) +{ + uint32 hashcode; + struct ConnHtabBin *c, + *p, + *trash; + + hashcode = CONN_HASH_VALUE(&conn->conn_info) % this->size; + + c = this->table[hashcode]; + + /* find entry */ + p = NULL; + while (c != NULL) + { + /* found ? */ + if (c->conn == conn) + break; + + p = c; + c = c->next; + } + + /* not found ? */ + if (c == NULL) + { + return; + } + + /* found the connection, remove from the chain. */ + trash = c; + + if (p == NULL) + this->table[hashcode] = c->next; + else + p->next = c->next; + + ic_free(trash); + + ic_statistics.activeConnectionsNum--; + + return; +} + +/* + * findConnByHeader + * Find the corresponding connection given a pkt header information. + * + * With the new mirroring scheme, the interconnect is no longer involved: + * we don't have to disambiguate anymore. + * + * NOTE: the icpkthdr field dstListenerPort is used for disambiguation. + * on receivers it may not match the actual port (it may have an extra bit + * set (1<<31)). + */ +UDPConn * +ConnHashTable::find(icpkthdr *hdr) { + + uint32 hashcode = CONN_HASH_VALUE(hdr) % this->size; + for (struct ConnHtabBin *bin = this->table[hashcode]; bin != NULL; bin = bin->next) { + UDPConn *conn = bin->conn; + + if (CONN_HASH_MATCH(&conn->conn_info, hdr)) { + UDPConn *ret = conn; + if (IC_DEBUG5 >= session_param.log_min_messages) + LOG(DEBUG5, "ConnHashTable::find: found. route %d state %d hashcode %d conn %p", + conn->route, ret->state, hashcode, ret); + + return ret; + } + } + + if (IC_DEBUG5 >= session_param.log_min_messages) + LOG(DEBUG5, "ConnHashTable::find: not found! (hdr->srcPid %d hdr->srcContentId %d " + "hdr->dstContentId %d hdr->dstPid %d sess(%d:%d) cmd(%d:%d)) hashcode %d", + hdr->srcPid, hdr->srcContentId, hdr->dstContentId, hdr->dstPid, hdr->sessionId, + session_param.gp_session_id, hdr->icId, ic_control_info.ic_instance_id, hashcode); + + return NULL; +} + +void +ConnHashTable::destroy() { + for (int i = 0; i < this->size; i++) { + while (this->table[i] != NULL) { + struct ConnHtabBin *trash = this->table[i]; + this->table[i] = trash->next; + ic_free(trash); + } + } + + ic_free(this->table); + this->table = NULL; + this->size = 0; +} + +/* + * icBufferListInitHeadLink + * Initialize the pointers in the head link to point to itself. + */ +static inline void +icBufferListInitHeadLink(ICBufferLink *link) +{ + link->next = link->prev = link; +} + + +#if defined(USE_ASSERT_CHECKING) || defined(AMS_VERBOSE_LOGGING) + +/* + * icBufferListLog + * Log the buffer list. + */ +void +ICBufferList::icBufferListLog() +{ + LOG(INFO, "Length %d, type %d headptr %p", this->len, this->type, &this->head); + + ICBufferLink *bufLink = this->head.next; + + int len = this->len; + int i = 0; + + while (bufLink != &this->head && len > 0) + { + ICBuffer *buf = (this->type == ICBufferListType_Primary ? GET_ICBUFFER_FROM_PRIMARY(bufLink) + : GET_ICBUFFER_FROM_SECONDARY(bufLink)); + + LOG(INFO, "Node %d, linkptr %p", i++, bufLink); + + logPkt("from list", buf->pkt); + bufLink = bufLink->next; + len--; + } +} +#endif + +#ifdef USE_ASSERT_CHECKING +/* + * icBufferListCheck + * Buffer list sanity check. 
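`icBufferListInitHeadLink()` above, together with the `ICBufferList` methods that follow, implements a circular doubly-linked list with a sentinel head link that points to itself when the list is empty. A minimal standalone sketch of that structure (the `DLink` type and function names are illustrative; the real list embeds its links inside `ICBuffer` and distinguishes primary/secondary membership):

```cpp
// Sentinel-head circular doubly-linked list: empty means head->next == head.
#include <cstdio>

struct DLink { DLink *next; DLink *prev; };

static void dlist_init(DLink *head)                 /* mirrors icBufferListInitHeadLink */
{
    head->next = head->prev = head;
}

static bool dlist_is_empty(const DLink *head) { return head->next == head; }

static void dlist_append(DLink *head, DLink *node)  /* append at tail */
{
    node->prev = head->prev;
    node->next = head;
    head->prev->next = node;
    head->prev = node;
}

static DLink *dlist_pop(DLink *head)                /* remove from front */
{
    if (dlist_is_empty(head))
        return nullptr;
    DLink *node = head->next;
    node->prev->next = node->next;
    node->next->prev = node->prev;
    return node;
}

int main()
{
    DLink head, a, b;
    dlist_init(&head);
    dlist_append(&head, &a);
    dlist_append(&head, &b);
    std::printf("pop a? %d  pop b? %d  empty? %d\n",
                dlist_pop(&head) == &a, dlist_pop(&head) == &b, dlist_is_empty(&head));
    return 0;
}
```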
+ */ +void +ICBufferList::icBufferListCheck(const char *prefix) +{ + int len = this->len; + ICBufferLink *link = this->head.next; + + if (len < 0) + { + LOG(LOG_ERROR, "ICBufferList ERROR %s: list length %d < 0 ", prefix, this->length()); + goto error; + } + + if (len == 0 && (this->head.prev != this->head.next && this->head.prev != &this->head)) + { + LOG(LOG_ERROR, "ICBufferList ERROR %s: length is 0, &list->head %p, prev %p, next %p", + prefix, &this->head, this->head.prev, this->head.next); + this->icBufferListLog(); + goto error; + } + + while (len > 0) + { + link = link->next; + len--; + } + + if (link != &this->head) + { + LOG(LOG_ERROR, "ICBufferList ERROR: %s len %d", prefix, this->len); + this->icBufferListLog(); + goto error; + } + + return; + +error: + LOG(INFO, "wait for 120s and then abort."); + ic_usleep(120000000); + abort(); +} +#endif + +/* + * ICBufferList::init + * Initialize the buffer list with the given type. + */ +void +ICBufferList::init(ICBufferListType atype) +{ + Assert(atype == ICBufferListType_Primary|| atype == ICBufferListType_Secondary); + + type = atype; + len = 0; + + icBufferListInitHeadLink(&head); + +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::init"); +#endif +} + +/* + * ICBufferList::is_head + * Return whether the given link is the head link of the list. + * + * This function is often used as the end condition of an iteration of the list. + */ +bool +ICBufferList::is_head(ICBufferLink *link) +{ +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::is_head"); +#endif + return (link == &head); +} + +/* + * ICBufferList::first + * Return the first link after the head link. + * + * Note that the head link is a pseudo link used to only to ease the operations of the link list. + * If the list only contains the head link, this function will return the head link. + */ +ICBufferLink * +ICBufferList::first() +{ +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::first"); +#endif + return head.next; +} + +/* + * ICBufferList::length + * Get the list length. + */ +int +ICBufferList::length() +{ + return len; +} + +/* + * ICBufferList::delete + * Remove an buffer from the buffer list and return the buffer. + */ +ICBuffer * +ICBufferList::remove(ICBuffer *buf) +{ +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::delete"); +#endif + + ICBufferLink *bufLink = NULL; + + bufLink = (this->type == ICBufferListType_Primary ? &buf->primary : &buf->secondary); + + bufLink->prev->next = bufLink->next; + bufLink->next->prev = bufLink->prev; + + len--; + + return buf; +} + +/* + * ICBufferList::pop + * Remove the head buffer from the list. + */ +ICBuffer * +ICBufferList::pop() +{ + ICBuffer *buf = NULL; + ICBufferLink *bufLink = NULL; + +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::pop"); +#endif + + if (this->len == 0) + return NULL; + + bufLink = this->first(); + buf = (this->type == ICBufferListType_Primary ? GET_ICBUFFER_FROM_PRIMARY(bufLink) + : GET_ICBUFFER_FROM_SECONDARY(bufLink)); + + bufLink->prev->next = bufLink->next; + bufLink->next->prev = bufLink->prev; + + this->len--; + + return buf; +} + +/* + * ICBufferList::free + * Free all the buffers in the list. + */ +void +ICBufferList::destroy() +{ + ICBuffer *buf = NULL; + +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::free"); +#endif + + while ((buf = this->pop()) != NULL) + ic_free(buf); +} + +/* + * ICBufferList::append + * Append a buffer to the tail of a double-link list. 
+ */ +ICBuffer * +ICBufferList::append(ICBuffer *buf) +{ + Assert(buf); + +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::append"); +#endif + + ICBufferLink *bufLink = NULL; + + bufLink = (this->type == ICBufferListType_Primary ? &buf->primary : &buf->secondary); + + bufLink->prev = this->head.prev; + bufLink->next = &this->head; + + this->head.prev->next = bufLink; + this->head.prev = bufLink; + + this->len++; + + return buf; +} + +/* + * ICBufferList::return + * Return the buffers in the list to the free buffer list. + * + * If the buf is also in an expiration queue, we also need to remove it from the expiration queue. + * + */ +void +ICBufferList::release(bool inExpirationQueue) +{ +#ifdef USE_ASSERT_CHECKING + this->icBufferListCheck("ICBufferList::return"); +#endif + ICBuffer *buf = NULL; + + while ((buf = this->pop()) != NULL) + { + if (inExpirationQueue) /* the buf is in also in the expiration queue */ + { + ICBufferList *alist = &unack_queue_ring.slots[buf->unackQueueRingSlot]; + buf = alist->remove(buf); + unack_queue_ring.numOutStanding--; + if (this->length() >= 1) + unack_queue_ring.numSharedOutStanding--; + } + + snd_buffer_pool.freeList.append(buf); + } +} + +#ifdef USE_ASSERT_CHECKING +/* + * ICBufferList::dump_to_file + * Dump a buffer list. + */ +void +ICBufferList::dump_to_file(FILE *ofile) +{ + this->icBufferListCheck("ICBufferList::dump_to_file"); + + ICBufferLink *bufLink = this->head.next; + + int len = this->len; + int i = 0; + + fprintf(ofile, "List Length %d\n", len); + while (bufLink != &this->head && len > 0) + { + ICBuffer *buf = (this->type == ICBufferListType_Primary ? GET_ICBUFFER_FROM_PRIMARY(bufLink) + : GET_ICBUFFER_FROM_SECONDARY(bufLink)); + + fprintf(ofile, "Node %d, linkptr %p ", i++, bufLink); + fprintf(ofile, "Packet Content [%s: seq %d extraSeq %d]: motNodeId %d, crc %d len %d " + "srcContentId %d dstDesContentId %d " + "srcPid %d dstPid %d " + "srcListenerPort %d dstListernerPort %d " + "sendSliceIndex %d recvSliceIndex %d " + "sessionId %d icId %d " + "flags %d\n", + buf->pkt->flags & UDPIC_FLAGS_RECEIVER_TO_SENDER ? "ACK" : "DATA", + buf->pkt->seq, buf->pkt->extraSeq, buf->pkt->motNodeId, buf->pkt->crc, buf->pkt->len, + buf->pkt->srcContentId, buf->pkt->dstContentId, + buf->pkt->srcPid, buf->pkt->dstPid, + buf->pkt->srcListenerPort, buf->pkt->dstListenerPort, + buf->pkt->sendSliceIndex, buf->pkt->recvSliceIndex, + buf->pkt->sessionId, buf->pkt->icId, + buf->pkt->flags); + bufLink = bufLink->next; + len--; + } +} +#endif + +/* + * initUnackQueueRing + * Initialize an unack queue ring. + * + * Align current time to a slot boundary and set current slot index (time pointer) to 0. + */ +static void +initUnackQueueRing(UnackQueueRing *uqr) +{ + int i = 0; + + uqr->currentTime = 0; + uqr->idx = 0; + uqr->numOutStanding = 0; + uqr->numSharedOutStanding = 0; + + for (; i < UNACK_QUEUE_RING_SLOTS_NUM; i++) + { + uqr->slots[i].init(ICBufferListType_Secondary); + } + +#ifdef TIMEOUT_Z + uqr->retrans_count = 0; + uqr->no_retrans_count = 0; + uqr->time_difference = 0; + uqr->min = 0; + uqr->max = 0; +#endif +} + +/* + * RxBufferPool::get + * Get a receive buffer. + * + * SHOULD BE CALLED WITH ic_control_info.lock *LOCKED* + * + * NOTE: This function MUST NOT contain elog or ereport statements. + * elog is NOT thread-safe. Developers should instead use something like: + * + * NOTE: In threads, we cannot use palloc/pfree, because it's not thread safe. 
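The `unack_queue_ring` initialized above is a timing wheel: a fixed number of slots each covering `TIMER_SPAN` of time, so the retransmission scan only has to advance one slot per span rather than inspect every outstanding packet. A standalone sketch of that idea follows; the slot-placement arithmetic is an assumption for illustration, since `putIntoUnackQueueRing()` itself is defined elsewhere in the file.

```cpp
// Timing-wheel sketch: park entries by expiration slot, expire by advancing
// one slot per span, as the unack queue ring does for un-ACKed packets.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int      kSlots  = 32;     /* stand-in for UNACK_QUEUE_RING_SLOTS_NUM */
constexpr uint64_t kSpanUs = 1000;   /* stand-in for TIMER_SPAN (microseconds)  */

struct Wheel
{
    uint64_t current_time = 0;       /* time covered up to the current slot */
    int      idx          = 0;       /* index of the current slot           */
    std::vector<std::vector<int>> slots = std::vector<std::vector<int>>(kSlots);

    void add(int pkt_id, uint64_t expiration_us, uint64_t now_us)
    {
        if (current_time == 0)
            current_time = now_us;                   /* anchor the wheel lazily */
        uint64_t offset = (now_us + expiration_us - current_time) / kSpanUs;
        if (offset >= static_cast<uint64_t>(kSlots))
            offset = kSlots - 1;                     /* clamp far-future entries */
        slots[(idx + offset) % kSlots].push_back(pkt_id);
    }

    /* Whatever sits in a slot that has fallen behind now_us has timed out. */
    void expire(uint64_t now_us)
    {
        while (now_us >= current_time + kSpanUs)
        {
            for (int pkt : slots[idx])
                std::printf("resend packet %d\n", pkt);
            slots[idx].clear();
            current_time += kSpanUs;
            idx = (idx + 1) % kSlots;
        }
    }
};

int main()
{
    Wheel w;
    w.add(7, /*expiration_us=*/2500, /*now_us=*/10000);
    w.expire(14000);                                 /* packet 7 is "resent" once */
    return 0;
}
```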
+ */ +icpkthdr * +RxBufferPool::get() +{ + icpkthdr *ret = NULL; + +#ifdef USE_ASSERT_CHECKING + if (FINC_HAS_FAULT(FINC_RX_BUF_NULL) && + testmode_inject_fault(session_param.gp_udpic_fault_inject_percent)) + return NULL; +#endif + + do + { + if (this->freeList == NULL) + { + if (this->count > this->maxCount) + { + if (IC_DEBUG3 >= session_param.log_min_messages) + LOG(DEBUG3, "Interconnect ran out of rx-buffers count/max %d/%d", this->count, this->maxCount); + break; + } + + /* malloc is used for thread safty. */ + ret = (icpkthdr *) ic_malloc(global_param.Gp_max_packet_size); + + /* + * Note: we return NULL if the malloc() fails -- and the + * background thread will set the error. Main thread will check + * the error, report it and start teardown. + */ + if (ret != NULL) + this->count++; + + break; + } + + /* we have buffers available in our freelist */ + ret = this->get_free(); + + } while (0); + + return ret; +} + +/* + * RxBufferPool::put + * Return a receive buffer to free list + * + * SHOULD BE CALLED WITH ic_control_info.lock *LOCKED* + */ +void +RxBufferPool::put(icpkthdr *buf) +{ + /* return the buffer into the free list. */ + *(char **) buf = this->freeList; + this->freeList = (char *)buf; +} + +/* + * RxBufferPool::get_free + * Get a receive buffer from free list + * + * SHOULD BE CALLED WITH ic_control_info.lock *LOCKED* + * + * NOTE: This function MUST NOT contain elog or ereport statements. + * elog is NOT thread-safe. Developers should instead use something like: + * + * NOTE: In threads, we cannot use palloc/pfree, because it's not thread safe. + */ +icpkthdr * +RxBufferPool::get_free() +{ + icpkthdr *buf = NULL; + + buf = (icpkthdr *) this->freeList; + this->freeList = *(char **) (this->freeList); + return buf; +} + +/* + * RxBufferPool::free + * Free a receive buffer. + * + * NOTE: This function MUST NOT contain elog or ereport statements. + * elog is NOT thread-safe. Developers should instead use something like: + * + * NOTE: In threads, we cannot use palloc/pfree, because it's not thread safe. + */ +void +RxBufferPool::release(icpkthdr *buf) +{ + ic_free(buf); + count--; +} + +/* + * init + * Initialize the send buffer pool. + * + * The initial maxCount is set to 1 for gp_interconnect_snd_queue_depth = 1 case, + * then there is at least an extra free buffer to send for that case. + */ +void +SendBufferPool::init() +{ + this->freeList.init(ICBufferListType_Primary); + this->count = 0; + this->maxCount = (session_param.Gp_interconnect_snd_queue_depth == 1 ? 1 : 0); +} + +/* + * clean + * Clean the send buffer pool. + */ +void +SendBufferPool::clean() +{ + this->freeList.destroy(); + this->count = 0; + this->maxCount = 0; +} + +/* + * get + * Get a send buffer for a connection. + * + * Different flow control mechanisms use different buffer management policies. + * Capacity based flow control uses per-connection buffer policy and Loss based + * flow control uses shared buffer policy. + * + * Return NULL when no free buffer available. 
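`RxBufferPool::put()` and `get_free()` above thread the free list through the freed buffers themselves: the first pointer-sized bytes of a free packet buffer hold the previous list head, so no separate bookkeeping nodes are needed. A minimal standalone sketch of that trick (names are illustrative; the real pool also tracks count/maxCount and deliberately sticks to malloc for thread safety):

```cpp
// Intrusive free list: a freed buffer stores the "next" pointer in its own
// first bytes (assumes buf_size >= sizeof(char *)).
#include <cstdio>
#include <cstdlib>

struct BufPool
{
    char  *free_list = nullptr;   /* head of the intrusive free list */
    size_t buf_size;

    explicit BufPool(size_t sz) : buf_size(sz) {}

    void put(void *buf)           /* push: overwrite the buffer start with the old head */
    {
        *reinterpret_cast<char **>(buf) = free_list;
        free_list = static_cast<char *>(buf);
    }

    void *get()                   /* pop from the free list, else allocate fresh */
    {
        if (free_list != nullptr)
        {
            void *buf = free_list;
            free_list = *reinterpret_cast<char **>(free_list);
            return buf;
        }
        return std::malloc(buf_size);
    }
};

int main()
{
    BufPool pool(1024);
    void *a = pool.get();         /* fresh allocation */
    pool.put(a);                  /* parked on the free list */
    void *b = pool.get();         /* same buffer comes back */
    std::printf("reused: %d\n", a == b);
    std::free(b);
    return 0;
}
```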
+ */ +ICBuffer * +SendBufferPool::get(UDPConn *conn) +{ + ICBuffer *ret = NULL; + + ic_statistics.totalBuffers += (this->freeList.length() + this->maxCount - this->count); + ic_statistics.bufferCountingTime++; + + /* Capacity based flow control does not use shared buffers */ + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY_IC) + { + Assert(conn->unackQueue.length() + conn->sndQueue.length() <= session_param.Gp_interconnect_snd_queue_depth); + if (conn->unackQueue.length() + conn->sndQueue.length() >= session_param.Gp_interconnect_snd_queue_depth) + return NULL; + } + + if (this->freeList.length() > 0) + { + return this->freeList.pop(); + } + else + { + if (this->count < this->maxCount) + { + ret = (ICBuffer *) ic_malloc0(global_param.Gp_max_packet_size + sizeof(ICBuffer)); + this->count++; + ret->conn = NULL; + ret->nRetry = 0; + icBufferListInitHeadLink(&ret->primary); + icBufferListInitHeadLink(&ret->secondary); + ret->unackQueueRingSlot = 0; + } + else + { + return NULL; + } + } + + return ret; +} + +static struct rto_hashstore* +initRTOHashstore() +{ + int i; + struct rto_hashstore* hs = (struct rto_hashstore*)ic_malloc(sizeof(struct rto_hashstore)); + + for (i = 0; i < RTO_HASH; i++) + TAILQ_INIT(&hs->rto_list[i]); + + TAILQ_INIT(&hs->rto_list[RTO_HASH]); + + return hs; +} + +static void +initUdpManager(mudp_manager_t mudp) +{ + mudp->rto_store = initRTOHashstore(); + mudp->rto_list_cnt = 0; + mudp->cur_ts = 0; +} + +static inline void +addtoRTOList(mudp_manager_t mudp, UDPConn *cur_stream) +{ + if (!mudp->rto_list_cnt) + { + mudp->rto_store->rto_now_idx = 0; + mudp->rto_store->rto_now_ts = cur_stream->sndvar.ts_rto; + } + + if (cur_stream->on_rto_idx < 0 ) + { + if (cur_stream->on_timewait_list) + return; + + int diff = (int32_t)(cur_stream->sndvar.ts_rto - mudp->rto_store->rto_now_ts); + if (diff < RTO_HASH) + { + int offset= (diff + mudp->rto_store->rto_now_idx) % RTO_HASH; + cur_stream->on_rto_idx = offset; + TAILQ_INSERT_TAIL(&(mudp->rto_store->rto_list[offset]), + cur_stream, sndvar.timer_link); + } + else + { + cur_stream->on_rto_idx = RTO_HASH; + TAILQ_INSERT_TAIL(&(mudp->rto_store->rto_list[RTO_HASH]), + cur_stream, sndvar.timer_link); + } + mudp->rto_list_cnt++; + } +} + +static inline void +removeFromRTOList(mudp_manager_t mudp, + UDPConn *cur_stream) +{ + if (cur_stream->on_rto_idx < 0) + return; + + TAILQ_REMOVE(&mudp->rto_store->rto_list[cur_stream->on_rto_idx], + cur_stream, sndvar.timer_link); + cur_stream->on_rto_idx = -1; + + mudp->rto_list_cnt--; +} + +static inline void +updateRetransmissionTimer(mudp_manager_t mudp, + UDPConn *cur_stream, + uint32_t cur_ts) +{ + cur_stream->sndvar.nrtx = 0; + + /* if in rto list, remove it */ + if (cur_stream->on_rto_idx >= 0) + removeFromRTOList(mudp, cur_stream); + + /* Reset retransmission timeout */ + if (UDP_SEQ_GT(cur_stream->snd_nxt, cur_stream->sndvar.snd_una)) + { + /* there are packets sent but not acked */ + /* update rto timestamp */ + cur_stream->sndvar.ts_rto = cur_ts + cur_stream->sndvar.rto; + addtoRTOList(mudp, cur_stream); + } + + if (cur_stream->on_rto_idx == -1) + { + cur_stream->sndvar.ts_rto = cur_ts + cur_stream->sndvar.rto; + addtoRTOList(mudp, cur_stream); + } +} + +static int +handleRTO(mudp_manager_t mudp, + uint32_t cur_ts, + UDPConn *cur_stream, + ICChunkTransportState *transportStates, + TransportEntry *pEntry, + UDPConn *triggerConn) +{ + /* check for expiration */ + int count = 0; + int retransmits = 0; + UDPConn *currBuffConn = NULL; + uint32_t now = cur_ts; + + 
Assert(unack_queue_ring.currentTime != 0); + removeFromRTOList(mudp, cur_stream); + + while (now >= (unack_queue_ring.currentTime + TIMER_SPAN) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) + { + /* expired, need to resend them */ + ICBuffer *curBuf = NULL; + + while ((curBuf = unack_queue_ring.slots[unack_queue_ring.idx].pop()) != NULL) + { + curBuf->nRetry++; + currBuffConn = static_cast(curBuf->conn); + putIntoUnackQueueRing( + &unack_queue_ring, + curBuf, + currBuffConn->computeExpirationPeriod(curBuf->nRetry), now); + +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.update(TPE_DATA_PKT_SEND, curBuf->pkt); +#endif + + currBuffConn->sendOnce(curBuf->pkt); + + retransmits++; + ic_statistics.retransmits++; + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, currBuffConn->stat_count_resent); + UDPConn::checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", + curBuf->pkt->seq, curBuf->nRetry, curBuf->conn->rtt, curBuf->conn->route); + logPkt("RESEND PKT in checkExpiration", curBuf->pkt); +#endif + } + + unack_queue_ring.currentTime += TIMER_SPAN; + unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); + } + return 0; +} + +static inline void +rearrangeRTOStore(mudp_manager_t mudp) +{ + UDPConn *walk, *next; + struct rto_hashstore::rto_head* rto_list = &mudp->rto_store->rto_list[RTO_HASH]; + int cnt = 0; + + for (walk = TAILQ_FIRST(rto_list); walk != NULL; walk = next) + { + next = TAILQ_NEXT(walk, sndvar.timer_link); + + int diff = (int32_t)(mudp->rto_store->rto_now_ts - walk->sndvar.ts_rto); + if (diff < RTO_HASH) + { + int offset = (diff + mudp->rto_store->rto_now_idx) % RTO_HASH; + TAILQ_REMOVE(&mudp->rto_store->rto_list[RTO_HASH], + walk, sndvar.timer_link); + walk->on_rto_idx = offset; + TAILQ_INSERT_TAIL(&(mudp->rto_store->rto_list[offset]), + walk, sndvar.timer_link); + } + cnt++; + } +} + +static inline void +checkRtmTimeout(mudp_manager_t mudp, + uint32_t cur_ts, + int thresh, + ICChunkTransportState *transportStates, + TransportEntry *pEntry, + UDPConn *triggerConn) +{ + UDPConn *walk, *next; + struct rto_hashstore::rto_head* rto_list; + int cnt; + + if (!mudp->rto_list_cnt) + return; + + cnt = 0; + + while (1) + { + rto_list = &mudp->rto_store->rto_list[mudp->rto_store->rto_now_idx]; + if ((int32_t)(cur_ts - mudp->rto_store->rto_now_ts) < 0) + break; + + for (walk = TAILQ_FIRST(rto_list); walk != NULL; walk = next) + { + if (++cnt > thresh) + break; + next = TAILQ_NEXT(walk, sndvar.timer_link); + + if (walk->on_rto_idx >= 0) + { + TAILQ_REMOVE(rto_list, walk, sndvar.timer_link); + mudp->rto_list_cnt--; + walk->on_rto_idx = -1; + handleRTO(mudp, cur_ts, walk, transportStates, pEntry, triggerConn); + } + } + + if (cnt > thresh) + { + break; + } + else + { + mudp->rto_store->rto_now_idx = (mudp->rto_store->rto_now_idx + 1) % RTO_HASH; + mudp->rto_store->rto_now_ts++; + if (!(mudp->rto_store->rto_now_idx % 1000)) + rearrangeRTOStore(mudp); + } + + } +} + +/* + * estimateRTT - Dynamically estimates the Round-Trip Time (RTT) and adjusts Retransmission Timeout (RTO) + * + * This function implements a variant of the Jacobson/Karels algorithm for RTT estimation, adapted for UDP-based + * motion control connections. It updates smoothed RTT (srtt), mean deviation (mdev), and RTO values based on + * newly measured RTT samples (mrtt). 
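The `estimateRTT()` comment here describes a Jacobson/Karels-style estimator kept in fixed point (shifts instead of divisions, with extra clamping for loopback-class links). As a conceptual model only, the sketch below applies the same update gains (1/8 for the smoothed RTT, 1/4 for the deviation) in plain floating point and uses the textbook timeout `srtt + 4·rttvar` clamped to a floor and ceiling; the values `kRtoMinUs`/`kRtoMaxUs` are illustrative stand-ins, and the production code derives its RTO from its own scaled terms.

```cpp
// Floating-point model of the RTT/RTO estimator described in the comment.
#include <algorithm>
#include <cmath>
#include <cstdio>

struct RttEstimator
{
    double srtt   = 0.0;       /* smoothed RTT, microseconds     */
    double rttvar = 0.0;       /* RTT deviation, microseconds    */
    double rto    = 200000.0;  /* current timeout (illustrative) */

    static constexpr double kRtoMinUs = 20000.0;    /* ~20 ms floor, per the comment */
    static constexpr double kRtoMaxUs = 3000000.0;  /* illustrative RTO_MAX stand-in */

    void sample(double rtt_us)
    {
        if (srtt == 0.0)                 /* first measurement */
        {
            srtt   = rtt_us;
            rttvar = rtt_us / 2.0;
        }
        else
        {
            double err = rtt_us - srtt;
            srtt   += err / 8.0;                          /* gain g = 1/8 */
            rttvar += (std::fabs(err) - rttvar) / 4.0;    /* gain h = 1/4 */
        }
        rto = std::clamp(srtt + 4.0 * rttvar, kRtoMinUs, kRtoMaxUs);
    }
};

int main()
{
    RttEstimator est;
    const double samples[] = {1000.0, 1200.0, 900.0, 5000.0};   /* spike at the end */
    for (double rtt : samples)
        est.sample(rtt);
    std::printf("srtt=%.0f us rttvar=%.0f us rto=%.0f us\n", est.srtt, est.rttvar, est.rto);
    return 0;
}
```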
The RTO calculation ensures reliable data transmission over unreliable networks. + * + * Key Components: + * - srtt: Smoothed Round-Trip Time (weighted average of historical RTT samples) + * - mdev: Mean Deviation (measure of RTT variability) + * - rttvar: Adaptive RTT variation bound (used to clamp RTO updates) + * - rto: Retransmission Timeout (dynamically adjusted based on srtt + rttvar) + * + * Algorithm Details: + * 1. For the first RTT sample: + * srtt = mrtt << 3 (scaled by 8 for fixed-point arithmetic) + * mdev = mrtt << 1 (scaled by 2) + * rttvar = max(mdev, rto_min) + * 2. For subsequent samples: + * Delta = mrtt - (srtt >> 3) (difference between new sample and smoothed RTT) + * srtt += Delta (update srtt with 1/8 weight of new sample) + * Delta = abs(Delta) - (mdev >> 2) + * mdev += Delta (update mdev with 1/4 weight) + * 3. rttvar bounds the maximum RTT variation: + * If mdev > mdev_max, update mdev_max and rttvar + * On new ACKs (snd_una > rtt_seq), decay rttvar toward mdev_max + * 4. Final RTO calculation: + * rto = (srtt >> 3) + rttvar (clamped to RTO_MAX) + * + * Parameters: + * @mConn: Parent motion connection context (container of MotionConnUDP) + * @mrtt: Measured Round-Trip Time (in microseconds) for the latest packet + * + * Notes: + * - Designed for non-retransmitted packets to avoid sampling bias. + * - Uses fixed-point arithmetic to avoid floating-point operations. + * - Minimum RTO (rto_min) is set to 20ms (HZ/5/10, assuming HZ=100). + * - Critical for adaptive timeout control in UDP protocols where reliability is implemented at the application layer. + * - Thread-unsafe: Must be called in a synchronized context (e.g., packet processing loop). + */ +static inline void +estimateRTT(UDPConn *conn , uint32_t mrtt) +{ + /* This function should be called for not retransmitted packets */ + /* TODO: determine rto_min */ + + long m = mrtt; + uint32_t rto_min = UDP_RTO_MIN / 10; + + if (m == 0) + m = 1; + + /* + * Special RTO optimization for high-speed networks: + * When measured RTT (m) is below 100 microseconds and current RTO is under 10ms, + * forcibly set RTO to half of RTO_MIN. This targets two scenarios: + * - Loopback interfaces (localhost communication) + * - Ultra-low-latency networks (e.g., InfiniBand, RDMA) + */ + if(m < 100 && conn->rttvar.rto < 10000) + { + conn->rttvar.rto = RTO_MIN / 2; + } + + if (conn->rttvar.srtt != 0) + { + /* rtt = 7/8 rtt + 1/8 new */ + m -= (conn->rttvar.srtt >> LOSS_THRESH); + conn->rttvar.srtt += m; + if (m < 0) + { + m = -m; + m -= (conn->rttvar.mdev >> RTT_SHIFT_ALPHA); + if (m > 0) + m >>= LOSS_THRESH; + } + else + { + m -= (conn->rttvar.mdev >> RTT_SHIFT_ALPHA); + } + conn->rttvar.mdev += m; + if (conn->rttvar.mdev > conn->rttvar.mdev_max) + { + conn->rttvar.mdev_max = conn->rttvar.mdev; + if (conn->rttvar.mdev_max > conn->rttvar.rttvar) + { + conn->rttvar.rttvar = conn->rttvar.mdev_max; + } + } + if (UDP_SEQ_GT(conn->rttvar.snd_una, conn->rttvar.rtt_seq)) + { + if (conn->rttvar.mdev_max < conn->rttvar.rttvar) + { + conn->rttvar.rttvar -= (conn->rttvar.rttvar - conn->rttvar.mdev_max) >> RTT_SHIFT_ALPHA; + } + conn->rttvar.mdev_max = rto_min; + } + } + else + { + /* fresh measurement */ + conn->rttvar.srtt = m << LOSS_THRESH; + conn->rttvar.mdev = m << 1; + conn->rttvar.mdev_max = conn->rttvar.rttvar = MAX(conn->rttvar.mdev, rto_min); + } + + conn->rttvar.rto = ((conn->rttvar.srtt >> LOSS_THRESH) + conn->rttvar.rttvar) > RTO_MAX ? 
RTO_MAX : ((conn->rttvar.srtt >> LOSS_THRESH) + conn->rttvar.rttvar); +} + +/* + * addCRC + * add CRC field to the packet. + */ +static void +addCRC(icpkthdr *pkt) +{ + pkt->crc = ComputeCRC(pkt, pkt->len); +} + +/* + * checkCRC + * check the validity of the packet. + */ +static bool +checkCRC(icpkthdr *pkt) +{ + uint32 rx_crc, + local_crc; + + rx_crc = pkt->crc; + pkt->crc = 0; + local_crc = ComputeCRC(pkt, pkt->len); + if (rx_crc != local_crc) + { + return false; + } + + return true; +} + + +/* + * checkRxThreadError + * Check whether there was error in the background thread in main thread. + * + * If error found, report it. + */ +static void +checkRxThreadError() +{ + int eno; + + eno = ic_atomic_read_u32(&ic_control_info.eno); + if (eno != 0) + { + errno = eno; + + std::stringstream ss; + ss <<"ERROR, interconnect encountered an error, in receive background thread: "<= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "UDP-IC: setsockopt %s failed to set buffer size = %d bytes: %m", + buffer_type == SO_SNDBUF ? "send": "receive", + curr_size); + + curr_size = curr_size >> 1; + if (curr_size < UDPIC_MIN_BUF_SIZE) + return -1; + } + + if(session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "UDP-IC: socket %s current buffer size = %d bytes", + buffer_type == SO_SNDBUF ? "send": "receive", + curr_size); + + return curr_size; +} + + +/* + * setupUDPListeningSocket + * Setup udp listening socket. + */ +static void +setupUDPListeningSocket(int *listenerSocketFd, int32 *listenerPort, int *txFamily, struct sockaddr_storage *listenerSockaddr) +{ + struct addrinfo *addrs = NULL; + struct addrinfo *addr; + struct addrinfo hints; + int ret; + int ic_socket = INVALID_SOCKET; + struct sockaddr_storage ic_socket_addr; + int tries = 0; + struct sockaddr_storage listenerAddr; + socklen_t listenerAddrlen = sizeof(ic_socket_addr); + uint32 socketSendBufferSize; + uint32 socketRecvBufferSize; + + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; /* Allow IPv4 or IPv6 */ + hints.ai_socktype = SOCK_DGRAM; /* Datagram socket */ + hints.ai_protocol = 0; + hints.ai_addrlen = 0; + hints.ai_addr = NULL; + hints.ai_canonname = NULL; + hints.ai_next = NULL; + hints.ai_flags |= AI_NUMERICHOST; + +#ifdef USE_ASSERT_CHECKING + if (session_param.gp_udpic_network_disable_ipv6) + hints.ai_family = AF_INET; +#endif + + if (global_param.Gp_interconnect_address_type == INTERCONNECT_ADDRESS_TYPE_UNICAST_IC) + { + Assert(global_param.interconnect_address && strlen(global_param.interconnect_address) > 0); + hints.ai_flags |= AI_NUMERICHOST; + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "getaddrinfo called with unicast address: %s", global_param.interconnect_address); + } + else + { + Assert(global_param.interconnect_address == NULL); + hints.ai_flags |= AI_PASSIVE; + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "getaddrinfo called with wildcard address"); + } + + /* + * Restrict what IP address we will listen on to just the one that was + * used to create this QE session. + */ + Assert(global_param.interconnect_address && strlen(global_param.interconnect_address) > 0); + ret = getaddrinfo((!global_param.interconnect_address || global_param.interconnect_address[0] == '\0') ? 
NULL : global_param.interconnect_address, + NULL, &hints, &addrs); + if (ret || !addrs) + { + LOG(INFO, "could not resolve address for UDP IC socket %s: %s", + global_param.interconnect_address, + gai_strerror(ret)); + goto startup_failed; + } + + /* + * On some platforms, pg_getaddrinfo_all() may return multiple addresses + * only one of which will actually work (eg, both IPv6 and IPv4 addresses + * when kernel will reject IPv6). Worse, the failure may occur at the + * bind() or perhaps even connect() stage. So we must loop through the + * results till we find a working combination. We will generate DEBUG + * messages, but no error, for bogus combinations. + */ + for (addr = addrs; addr != NULL; addr = addr->ai_next) + { +#ifdef HAVE_UNIX_SOCKETS + /* Ignore AF_UNIX sockets, if any are returned. */ + if (addr->ai_family == AF_UNIX) + continue; +#endif + + if (++tries > 1 && session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "trying another address for UDP interconnect socket"); + + ic_socket = socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol); + if (ic_socket == INVALID_SOCKET) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "could not create UDP interconnect socket: %m"); + continue; + } + + /* + * Bind the socket to a kernel assigned ephemeral port on the + * interconnect_address. + */ + if (bind(ic_socket, addr->ai_addr, addr->ai_addrlen) < 0) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "could not bind UDP interconnect socket: %m"); + closesocket(ic_socket); + ic_socket = INVALID_SOCKET; + continue; + } + + /* Call getsockname() to eventually obtain the assigned ephemeral port */ + if (getsockname(ic_socket, (struct sockaddr *) &listenerAddr, &listenerAddrlen) < 0) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "could not get address of UDP interconnect socket: %m"); + closesocket(ic_socket); + ic_socket = INVALID_SOCKET; + continue; + } + + /* If we get here, we have a working socket */ + break; + } + + if (!addr || ic_socket == INVALID_SOCKET) + goto startup_failed; + + /* Memorize the socket fd, kernel assigned port and address family */ + *listenerSocketFd = ic_socket; + if (listenerAddr.ss_family == AF_INET6) + { + *listenerPort = ntohs(((struct sockaddr_in6 *) &listenerAddr)->sin6_port); + *txFamily = AF_INET6; + } + else + { + *listenerPort = ntohs(((struct sockaddr_in *) &listenerAddr)->sin_port); + *txFamily = AF_INET; + } + + /* + * cache the successful sockaddr of the listening socket, so + * we can use this information to connect to the listening socket. + */ + if (listenerSockaddr != NULL) + memcpy(listenerSockaddr, &listenerAddr, sizeof(struct sockaddr_storage)); + + /* Set up socket non-blocking mode */ + if (!ic_set_noblock(ic_socket)) + { + LOG(INFO, "could not set UDP interconnect socket to nonblocking mode: %s", strerror(errno)); + goto startup_failed; + } + + /* Set up the socket's send and receive buffer sizes. 
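`setUDPSocketBufferSize()` above asks the kernel for a large `SO_RCVBUF`/`SO_SNDBUF` and halves the request on each failure until a floor is reached. A standalone sketch of that "ask big, back off" pattern (the size constants are illustrative stand-ins, not the patch's `UDPIC_*` values):

```cpp
// Request a large socket buffer, halving the request until setsockopt accepts it.
#include <cstdio>
#include <sys/socket.h>
#include <unistd.h>

static int set_udp_buffer(int sock, int buffer_type /* SO_RCVBUF or SO_SNDBUF */)
{
    int size      = 2 * 1024 * 1024;   /* initial request: 2 MB (illustrative) */
    int min_size  = 128 * 1024;        /* give up below 128 KB (illustrative)  */

    while (size >= min_size)
    {
        if (setsockopt(sock, SOL_SOCKET, buffer_type, &size, sizeof(size)) == 0)
            return size;               /* the kernel accepted this request */
        size >>= 1;                    /* halve and retry, as the real code does */
    }
    return -1;
}

int main()
{
    int sock = socket(AF_INET, SOCK_DGRAM, 0);
    if (sock < 0)
        return 1;
    int granted = set_udp_buffer(sock, SO_RCVBUF);
    std::printf("receive buffer request granted at %d bytes\n", granted);
    close(sock);
    return 0;
}
```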
*/ + socketRecvBufferSize = setUDPSocketBufferSize(ic_socket, SO_RCVBUF); + if (socketRecvBufferSize == static_cast(-1)) + goto startup_failed; + ic_control_info.socketRecvBufferSize = socketRecvBufferSize; + + socketSendBufferSize = setUDPSocketBufferSize(ic_socket, SO_SNDBUF); + if (socketSendBufferSize == static_cast(-1)) + goto startup_failed; + ic_control_info.socketSendBufferSize = socketSendBufferSize; + + if (addrs != NULL) + freeaddrinfo(addrs); + return; + +startup_failed: + if (addrs) + freeaddrinfo(addrs); + if (ic_socket != INVALID_SOCKET) + { + closesocket(ic_socket); + } + + std::stringstream ss; + ss << "ERROR,interconnect error: Could not set up udp listener socket: " << strerror(errno); + throw ICNetworkException(ss.str(), __FILE__, __LINE__); +} + +/* + * InitMutex + * Initialize mutex. + */ +static void +initMutex(pthread_mutex_t *mutex) +{ + pthread_mutexattr_t m_atts; + + pthread_mutexattr_init(&m_atts); + pthread_mutexattr_settype(&m_atts, PTHREAD_MUTEX_ERRORCHECK); + + pthread_mutex_init(mutex, &m_atts); +} + +/* + * Set up the udp interconnect pthread signal mask, we don't want to run our signal handlers + */ +static void +ic_set_pthread_sigmasks(sigset_t *old_sigs) +{ +#ifndef WIN32 + sigset_t sigs; + int err; + + sigfillset(&sigs); + + err = pthread_sigmask(SIG_BLOCK, &sigs, old_sigs); + if (err != 0) + { + std::stringstream ss; + ss << "ERROR: Failed to get pthread signal masks with return value: "<= 0) + closesocket(ICSenderSocket); + ICSenderSocket = -1; + ICSenderPort = 0; + ICSenderFamily = 0; + + memset(&udp_dummy_packet_sockaddr, 0, sizeof(udp_dummy_packet_sockaddr)); + +#ifdef USE_ASSERT_CHECKING + + /* + * Check malloc times, in Interconnect part, memory are carefully released + * in tear down code (even when error occurred). But if a FATAL error is + * reported, tear down code will not be executed. Thus, it is still + * possible the malloc times and free times do not match when we reach + * here. The process will die in this case, the mismatch does not + * introduce issues. + */ + if (icudp_malloc_times != 0) + LOG(INFO, "WARNING: malloc times and free times do not match. remain alloc times: %ld", icudp_malloc_times); +#endif +} + +/* + * getSockAddr + * Convert IP addr and port to sockaddr + */ +static void +getSockAddr(struct sockaddr_storage *peer, socklen_t *peer_len, const char *listenerAddr, int listenerPort) +{ + int ret; + char portNumberStr[32]; + char *service; + struct addrinfo *addrs = NULL; + struct addrinfo hint; + + /* + * Get socketaddr to connect to. + */ + + /* Initialize hint structure */ + memset(&hint, 0, sizeof(hint)); + hint.ai_socktype = SOCK_DGRAM; /* UDP */ + hint.ai_family = AF_UNSPEC; /* Allow for any family (v4, v6, even unix in + * the future) */ +#ifdef AI_NUMERICSERV + hint.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV; /* Never do name + * resolution */ +#else + hint.ai_flags = AI_NUMERICHOST; /* Never do name resolution */ +#endif + + snprintf(portNumberStr, sizeof(portNumberStr), "%d", listenerPort); + service = portNumberStr; + + addrs = NULL; + /* NULL has special meaning to getaddrinfo(). */ + ret = getaddrinfo((!listenerAddr || listenerAddr[0] == '\0') ? 
NULL : listenerAddr, + service, &hint, &addrs); + if (ret || !addrs) + { + if (addrs) + freeaddrinfo(addrs); + + std::stringstream ss; + ss<<"ERROR, interconnect error: Could not parse remote listener address: '"<ai_family, addrs->ai_socktype, addrs->ai_protocol, listenerAddr); + memset(peer, 0, sizeof(struct sockaddr_storage)); + memcpy(peer, addrs->ai_addr, addrs->ai_addrlen); + *peer_len = addrs->ai_addrlen; + + if (addrs) + freeaddrinfo(addrs); +} + +/* + * format_sockaddr + * Format a sockaddr to a human readable string + * + * This function must be kept threadsafe, elog/ereport/palloc etc are not + * allowed within this function. + */ +static char * +format_sockaddr_udp(struct sockaddr_storage *sa, char *buf, size_t len) +{ + int ret; + char remote_host[NI_MAXHOST]; + char remote_port[NI_MAXSERV]; + + ret = getnameinfo((const struct sockaddr *)sa, sizeof(struct sockaddr_storage), + remote_host, sizeof(remote_host), + remote_port, sizeof(remote_port), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) + { + strncpy(remote_host, "???", sizeof(remote_host)); + strncpy(remote_port, "???", sizeof(remote_port)); + } + + if (ret != 0) + snprintf(buf, len, "?host?:?port?"); + else + { +#ifdef HAVE_IPV6 + if (sa->ss_family == AF_INET6) + snprintf(buf, len, "[%s]:%s", remote_host, remote_port); + else +#endif + snprintf(buf, len, "%s:%s", remote_host, remote_port); + } + + return buf; +} + +/* + * setupOutgoingUDPConnection + * Setup outgoing UDP connection. + */ +static void +setupOutgoingUDPConnection(int icid, TransportEntry *pEntry, UDPConn *conn) +{ + ICCdbProcess *cdbProc = NULL; + + Assert(pEntry); + + cdbProc = conn->cdbProc; + Assert(conn->state == mcsSetupOutgoingConnection); + Assert(conn->cdbProc); + + conn->remoteContentId = cdbProc->contentid; + conn->stat_min_ack_time = ~((uint64) 0); + + /* Save the information for the error message if getaddrinfo fails */ + if (strchr(cdbProc->listenerAddr, ':') != 0) + snprintf(conn->remoteHostAndPort, sizeof(conn->remoteHostAndPort), + "[%s]:%d", cdbProc->listenerAddr, cdbProc->listenerPort); + else + snprintf(conn->remoteHostAndPort, sizeof(conn->remoteHostAndPort), + "%s:%d", cdbProc->listenerAddr, cdbProc->listenerPort); + + /* + * Get socketaddr to connect to. + */ + getSockAddr(&conn->peer, &conn->peer_len, cdbProc->listenerAddr, cdbProc->listenerPort); + + /* Save the destination IP address */ + format_sockaddr_udp(&conn->peer, conn->remoteHostAndPort, + sizeof(conn->remoteHostAndPort)); + + Assert(conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6); + + { +#ifdef USE_ASSERT_CHECKING + { + struct sockaddr_storage source_addr; + socklen_t source_addr_len; + + memset(&source_addr, 0, sizeof(source_addr)); + source_addr_len = sizeof(source_addr); + + if (getsockname(pEntry->txfd, (struct sockaddr *) &source_addr, &source_addr_len) == -1) + { + throw ICNetworkException(std::string("ERROR, interconnect Error: Could not get port from socket, %m")+strerror(errno), __FILE__, __LINE__); + } + Assert(pEntry->txfd_family == source_addr.ss_family); + } +#endif + + /* + * If the socket was created with a different address family than the + * place we are sending to, we might need to do something special. + */ + if (pEntry->txfd_family != conn->peer.ss_family) + { + /* + * If the socket was created AF_INET6, but the address we want to + * send to is IPv4 (AF_INET), we might need to change the address + * format. On Linux, it isn't necessary: glibc automatically + * handles this. 
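`format_sockaddr_udp()` above formats a peer address numerically (no name resolution, since it must stay thread-safe) and brackets IPv6 literals so the port stays unambiguous. A minimal standalone sketch of the same `getnameinfo()` usage:

```cpp
// Numeric "host:port" formatting with getnameinfo(); IPv6 gets [addr]:port.
#include <arpa/inet.h>
#include <cstdio>
#include <cstring>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>

static void format_addr(const struct sockaddr_storage *sa, socklen_t salen,
                        char *buf, size_t len)
{
    char host[NI_MAXHOST];
    char port[NI_MAXSERV];

    if (getnameinfo(reinterpret_cast<const struct sockaddr *>(sa), salen,
                    host, sizeof(host), port, sizeof(port),
                    NI_NUMERICHOST | NI_NUMERICSERV) != 0)
    {
        std::snprintf(buf, len, "?host?:?port?");
        return;
    }

    if (sa->ss_family == AF_INET6)
        std::snprintf(buf, len, "[%s]:%s", host, port);   /* e.g. [::1]:5432 */
    else
        std::snprintf(buf, len, "%s:%s", host, port);     /* e.g. 127.0.0.1:5432 */
}

int main()
{
    struct sockaddr_storage ss;
    std::memset(&ss, 0, sizeof(ss));
    struct sockaddr_in *in = reinterpret_cast<struct sockaddr_in *>(&ss);
    in->sin_family      = AF_INET;
    in->sin_port        = htons(5432);
    in->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    char text[64];
    format_addr(&ss, sizeof(struct sockaddr_in), text, sizeof(text));
    std::printf("%s\n", text);                            /* 127.0.0.1:5432 */
    return 0;
}
```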
But on MAC OSX and Solaris, we need to convert + * the IPv4 address to an V4-MAPPED address in AF_INET6 format. + */ + if (pEntry->txfd_family == AF_INET6) + { + LOG(DEBUG1, "We are inet6, remote is inet. Converting to v4 mapped address."); + ConvertToIPv4MappedAddr(&conn->peer, &conn->peer_len); + } + else + { + /* + * If we get here, something is really wrong. We created the + * socket as IPv4-only (AF_INET), but the address we are + * trying to send to is IPv6. It's possible we could have a + * V4-mapped address that we could convert to an IPv4 address, + * but there is currently no code path where that could + * happen. So this must be an error. + */ + throw ICNetworkException("ERROR: Trying to use an IPv4 (AF_INET) socket to send to an IPv6 address", __FILE__, __LINE__); + } + } + } + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "Interconnect connecting to seg%d slice%d %s pid=%d sockfd=%d", + conn->remoteContentId, pEntry->recvSlice->sliceIndex, conn->remoteHostAndPort, conn->cdbProc->pid, conn->sockfd); + + /* send connection request */ + memset(&conn->conn_info, 0, sizeof(conn->conn_info)); + conn->conn_info.len = 0; + conn->conn_info.flags = 0; + conn->conn_info.motNodeId = pEntry->motNodeId; + + conn->conn_info.recvSliceIndex = pEntry->recvSlice->sliceIndex; + conn->conn_info.sendSliceIndex = pEntry->sendSlice->sliceIndex; + conn->conn_info.srcContentId = global_param.segindex; + conn->conn_info.dstContentId = conn->cdbProc->contentid; + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "setupOutgoingUDPConnection: node %d route %d srccontent %d dstcontent %d: %s", + pEntry->motNodeId, conn->route, global_param.segindex, conn->cdbProc->contentid, conn->remoteHostAndPort); + + conn->conn_info.srcListenerPort = UDP2_GetListenPortUDP(); + conn->conn_info.srcPid = global_param.MyProcPid; + conn->conn_info.dstPid = conn->cdbProc->pid; + conn->conn_info.dstListenerPort = conn->cdbProc->listenerPort; + + conn->conn_info.sessionId = session_param.gp_session_id; + conn->conn_info.icId = icid; + + ic_control_info.connHtab.add(conn); + + /* + * No need to get the connection lock here, since background rx thread + * will never access send connections. + */ + conn->msgPos = NULL; + conn->msgSize = sizeof(conn->conn_info); + conn->stillActive = true; + conn->conn_info.seq = 1; + conn->rttvar.ts_rto = 0; + conn->rttvar.rto = UDP_INITIAL_RTO; + conn->rttvar.srtt = 0; + conn->rttvar.rttvar = 0; + conn->rttvar.snd_una = 0; + conn->rttvar.nrtx = 0; + conn->rttvar.max_nrtx = 0; + conn->rttvar.mss = UDP_DEFAULT_MSS; + conn->rttvar.cwnd = 2; + conn->rttvar.ssthresh = UDP_INFINITE_SSTHRESH; + conn->rttvar.loss_count = 0; + conn->rttvar.karn_mode = false; + conn->on_rto_idx = -1; + Assert(conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6); +} + +/* + * If the socket was created AF_INET6, but the address we want to + * send to is IPv4 (AF_INET), we need to change the address + * format. On Linux, this is not necessary: glibc automatically + * handles this. But on MAC OSX and Solaris, we need to convert + * the IPv4 address to IPv4-mapped IPv6 address in AF_INET6 format. + * + * The comment above relies on getaddrinfo() via function getSockAddr to get + * the correct V4-mapped address. We need to be careful here as we need to + * ensure that the platform we are using is POSIX 1003-2001 compliant. 
+ * Just to be on the safeside, we'll be keeping this function for + * now to be used for all platforms and not rely on POSIX. + * + * Since this can be called in a signal handler, we avoid the use of + * async-signal unsafe functions such as memset/memcpy + */ +static void +ConvertToIPv4MappedAddr(struct sockaddr_storage *sockaddr, socklen_t *o_len) +{ + const struct sockaddr_in *in = (const struct sockaddr_in *)sockaddr; + struct sockaddr_storage temp = {0}; + struct sockaddr_in6 *in6_new = (struct sockaddr_in6 *)&temp; + + /* Construct a IPv4-to-IPv6 mapped address. */ + temp.ss_family = AF_INET6; + in6_new->sin6_family = AF_INET6; + in6_new->sin6_port = in->sin_port; + in6_new->sin6_flowinfo = 0; + + ((uint16 *)&in6_new->sin6_addr)[5] = 0xffff; + + in6_new->sin6_addr.s6_addr32[3] = in->sin_addr.s_addr; + in6_new->sin6_scope_id = 0; + + /* copy it back */ + *sockaddr = temp; + *o_len = sizeof(struct sockaddr_in6); +} + +#if defined(__darwin__) +/* macos does not accept :: as the destination, we will need to covert this to the IPv6 loopback */ +static void +ConvertIPv6WildcardToLoopback(struct sockaddr_storage *dest) +{ + char address[INET6_ADDRSTRLEN]; + /* we want to terminate our own process, so this should be local */ + const struct sockaddr_in6 *in6 = (const struct sockaddr_in6 *)&udp_dummy_packet_sockaddr; + inet_ntop(AF_INET6, &in6->sin6_addr, address, sizeof(address)); + if (strcmp("::", address) == 0) + ((struct sockaddr_in6 *)dest)->sin6_addr = in6addr_loopback; +} +#endif + +/* + * handleCachedPackets + * Deal with cached packets. + */ +static void +handleCachedPackets(void) +{ + UDPConn *cachedConn = NULL; + UDPConn *setupConn = NULL; + ConnHtabBin *bin = NULL; + icpkthdr *pkt = NULL; + AckSendParam param; + int i = 0; + uint32 j = 0; + bool dummy; + + for (i = 0; i < ic_control_info.startupCacheHtab.size; i++) + { + bin = ic_control_info.startupCacheHtab.table[i]; + + while (bin) + { + cachedConn = bin->conn; + setupConn = NULL; + + for (j = 0; j < cachedConn->pkt_q_size; j++) + { + pkt = (icpkthdr *) cachedConn->pkt_q[j]; + + if (pkt == NULL) + continue; + + rx_buffer_pool.maxCount--; + + /* look up this pkt's connection in connHtab */ + setupConn = ic_control_info.connHtab.find(pkt); + if (setupConn == NULL) + { + /* mismatch! */ + rx_buffer_pool.put(pkt); + cachedConn->pkt_q[j] = NULL; + continue; + } + + memset(¶m, 0, sizeof(param)); + if (!handleDataPacket(setupConn, pkt, &cachedConn->peer, &cachedConn->peer_len, ¶m, &dummy)) + { + /* no need to cache this packet */ + rx_buffer_pool.put(pkt); + } + + ic_statistics.recvPktNum++; + if (param.msg.len != 0) + UDPConn::sendAckWithParam(¶m); + + cachedConn->pkt_q[j] = NULL; + } + bin = bin->next; + ic_control_info.startupCacheHtab.remove(cachedConn); + + /* + * MPP-19981 free the cached connections; otherwise memory leak + * would be introduced. + */ + ic_free(cachedConn->pkt_q); + delete cachedConn; + } + } +} + +/* + * CChunkTransportStateImpl::setup + * Internal function for setting up UDP interconnect. 
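`ConvertToIPv4MappedAddr()` above rewrites an `AF_INET` destination as the equivalent V4-mapped IPv6 address (`::ffff:a.b.c.d`) so it can be sent through an `AF_INET6` socket on platforms whose libc does not do this automatically. A standalone sketch of the conversion:

```cpp
// Rewrite an IPv4 sockaddr as a V4-mapped IPv6 sockaddr, keeping the port.
#include <arpa/inet.h>
#include <cstdio>
#include <cstring>
#include <netinet/in.h>

static void to_v4_mapped(const struct sockaddr_in *v4, struct sockaddr_in6 *v6)
{
    std::memset(v6, 0, sizeof(*v6));
    v6->sin6_family = AF_INET6;
    v6->sin6_port   = v4->sin_port;              /* the port carries over unchanged */

    unsigned char *bytes = v6->sin6_addr.s6_addr;
    bytes[10] = 0xff;                            /* the ::ffff: prefix */
    bytes[11] = 0xff;
    std::memcpy(&bytes[12], &v4->sin_addr, 4);   /* the original IPv4 address */
}

int main()
{
    struct sockaddr_in v4;
    std::memset(&v4, 0, sizeof(v4));
    v4.sin_family = AF_INET;
    v4.sin_port   = htons(9000);
    inet_pton(AF_INET, "10.1.2.3", &v4.sin_addr);

    struct sockaddr_in6 v6;
    to_v4_mapped(&v4, &v6);

    char text[INET6_ADDRSTRLEN];
    inet_ntop(AF_INET6, &v6.sin6_addr, text, sizeof(text));
    std::printf("%s\n", text);                   /* ::ffff:10.1.2.3 */
    return 0;
}
```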
+ */ +ICChunkTransportState* +CChunkTransportStateImpl::setup(ICSliceTable *sliceTable) +{ + pthread_mutex_lock(&ic_control_info.lock); + + Assert(sliceTable->ic_instance_id > 0); + + if (global_param.Gp_role == GP_ROLE_DISPATCH_IC) + { + /* + * QD use cursorHistoryTable to handle mismatch packets, no + * need to update ic_control_info.ic_instance_id + */ + Assert(session_param.gp_interconnect_id == sliceTable->ic_instance_id); + } + else + { + /* + * update ic_control_info.ic_instance_id, it is mainly used + * by rx thread to handle mismatch packets + */ + ic_control_info.ic_instance_id = sliceTable->ic_instance_id; + } + + CChunkTransportStateImpl *state_impl = new CChunkTransportStateImpl(sliceTable); + ICChunkTransportState *interconnect_context = static_cast(state_impl); + CChunkTransportStateImpl::state_ = static_cast(state_impl); + +#ifdef USE_ASSERT_CHECKING + ICExecSlice *mySlice = &interconnect_context->sliceTable->slices[sliceTable->localSlice]; + Assert(mySlice && mySlice->sliceIndex == sliceTable->localSlice); +#endif + +#ifdef USE_ASSERT_CHECKING + set_test_mode(); +#endif + + if (global_param.Gp_role == GP_ROLE_DISPATCH_IC) + { + CursorICHistoryTable *ich_table = &rx_control_info.cursorHistoryTable; + //DistributedTransactionId distTransId = getDistributedTransactionId(); TODO: add callback; + DistributedTransactionId distTransId = InvalidTransactionId; + + if (ich_table->count > (2 * ich_table->size)) + { + /* + * distTransId != lastDXatId + * Means the last transaction is finished, it's ok to make a prune. + */ + if (distTransId != rx_control_info.lastDXatId) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "prune cursor history table (count %d), icid %d, prune_id %d", + ich_table->count, sliceTable->ic_instance_id, sliceTable->ic_instance_id); + ich_table->prune(sliceTable->ic_instance_id); + } + /* + * distTransId == lastDXatId and they are not InvalidTransactionId(0) + * Means current (non Read-Only) transaction isn't finished, should not prune. + */ + else if (rx_control_info.lastDXatId != InvalidTransactionId) + { + ; + } + /* + * distTransId == lastDXatId and they are InvalidTransactionId(0) + * Means they are the same transaction or different Read-Only transactions. + * + * For the latter, it's hard to get a perfect timepoint to prune: prune eagerly may + * cause problems (pruned current Txn's Ic instances), but prune in low frequency + * causes memory leak. + * + * So, we choose a simple algorithm to prune it here. And if it mistakenly prune out + * the still-in-used Ic instance (with lower id), the query may hang forever. + * Then user have to set a bigger gp_interconnect_cursor_ic_table_size value and + * try the query again, it is a workaround. + * + * More backgrounds please see: https://github.com/greenplum-db/gpdb/pull/16458 + */ + else + { + if (sliceTable->ic_instance_id > ich_table->size) + { + uint32 prune_id = sliceTable->ic_instance_id - ich_table->size; + Assert(prune_id < sliceTable->ic_instance_id); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "prune cursor history table (count %d), icid %d, prune_id %d", + ich_table->count, sliceTable->ic_instance_id, prune_id); + ich_table->prune(prune_id); + } + } + } + + ich_table->add(sliceTable->ic_instance_id, session_param.gp_command_count); + /* save the latest transaction id */ + rx_control_info.lastDXatId = distTransId; + } + + /* Initiate receiving connections. 
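The comments above lay out the cursor-history pruning policy on the dispatcher: prune freely once the previous transaction has finished, never prune while the same write transaction is still open, and prune conservatively by a sliding window when only read-only cursors are involved. A standalone sketch reduces that policy to a pure decision function (types and names here are illustrative stand-ins; the real logic lives inline in `CChunkTransportStateImpl::setup()` and works on `rx_control_info.cursorHistoryTable`):

```cpp
// Decide the icId watermark below which cursor-history entries may be pruned.
#include <cstdint>
#include <cstdio>

constexpr uint64_t kInvalidXid = 0;

/* Returns the instance id below which entries may be pruned, or 0 to skip. */
static uint32_t prune_point(uint32_t table_count, uint32_t table_size,
                            uint32_t current_ic_id,
                            uint64_t current_xid, uint64_t last_xid)
{
    if (table_count <= 2 * table_size)
        return 0;                      /* table still small: nothing to do */

    if (current_xid != last_xid)
        return current_ic_id;          /* previous transaction finished: prune freely */

    if (last_xid != kInvalidXid)
        return 0;                      /* same write transaction still open: keep all */

    /* Both ids invalid: read-only cursors; prune by a table-sized window. */
    if (current_ic_id > table_size)
        return current_ic_id - table_size;

    return 0;
}

int main()
{
    std::printf("prune below icId %u\n",
                prune_point(/*count=*/300, /*size=*/128, /*icId=*/500,
                            /*xid=*/kInvalidXid, /*last=*/kInvalidXid));   /* 372 */
    return 0;
}
```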
*/ + state_impl->CreateRecvEntries(sliceTable); + + /* Initiate outgoing connections. */ + state_impl->CreateSendEntries(sliceTable); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "SetupUDPInterconnect will activate Listening on ports=%d/%d sockfd=%d.", 0, UDP2_GetListenPortUDP(), UDP_listenerFd); + + /* + * If there are packets cached by background thread, add them to the + * connections. + */ + if (session_param.gp_interconnect_cache_future_packets) + handleCachedPackets(); + + interconnect_context->activated = true; + + pthread_mutex_unlock(&ic_control_info.lock); + + return interconnect_context; +} + +/* + * sendControlMessage + * Helper function to send a control message. + */ +void +UDPConn::sendControlMessage(icpkthdr *pkt, int fd, struct sockaddr *addr, socklen_t peerLen) +{ + int n; + +#ifdef USE_ASSERT_CHECKING + if (testmode_inject_fault(session_param.gp_udpic_dropacks_percent)) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "THROW CONTROL MESSAGE with seq %d extraSeq %d srcpid %d despid %d", pkt->seq, pkt->extraSeq, pkt->srcPid, pkt->dstPid); +#endif + return; + } +#endif + + /* Add CRC for the control message. */ + if (session_param.gp_interconnect_full_crc) + addCRC(pkt); + + /* retry 10 times for sending control message */ + int counter = 0; + while (counter < 10) + { + counter++; + n = sendto(fd, (const char *)pkt, pkt->len, 0, addr, peerLen); + if (n < 0) + { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + continue; + else + { + LOG(INFO, "sendcontrolmessage: got errno %d", errno); + return; + } + } + break; + } + if (n < int(pkt->len)) + LOG(INFO, "sendcontrolmessage: got error %d errno %d seq %d", n, errno, pkt->seq); +} + +/* + * setAckParam + * Set the ack sending parameters. + */ +void +UDPConn::setAckParam(AckSendParam *param, int32 flags, uint32 seq, uint32 extraSeq) +{ + memcpy(¶m->msg, (char *) &this->conn_info, sizeof(icpkthdr)); + param->msg.flags = flags; + param->msg.seq = seq; + param->msg.extraSeq = extraSeq; + param->msg.len = sizeof(icpkthdr); + param->peer = this->peer; + param->peer_len = this->peer_len; +} + +/* + * sendAckWithParam + * Send acknowledgment to sender. + */ +void +UDPConn::sendAckWithParam(AckSendParam *param) +{ + sendControlMessage(¶m->msg, UDP_listenerFd, (struct sockaddr *) ¶m->peer, param->peer_len); +} + +/* + * sendAck + * Send acknowledgment to sender. + */ +void +UDPConn::sendAck(int32 flags, uint32 seq, uint32 extraSeq) +{ + icpkthdr msg; + + memcpy(&msg, (char *) &this->conn_info, sizeof(msg)); + msg.flags = flags; + msg.seq = seq; + msg.extraSeq = extraSeq; + msg.len = sizeof(icpkthdr); + + LOG(DEBUG1, "sendack: node %d route %d seq %d extraSeq %d, flags %s", msg.motNodeId, this->route, msg.seq, msg.extraSeq, flags2txt(msg.flags)); + + sendControlMessage(&msg, UDP_listenerFd, (struct sockaddr *) &this->peer, this->peer_len); +} + +/* + * sendDisorderAck + * Send a disorder message to the sender. + * + * Whenever the receiver detects a disorder packet, it will assemble a disorder message + * which contains the sequence numbers of the possibly lost packets. 
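`sendControlMessage()` above sends ACK-class packets with a bounded retry loop: a failed `sendto()` is retried when the error is `EINTR`, `EAGAIN`, or `EWOULDBLOCK`, and otherwise logged and dropped, since control messages are regenerated by higher-level logic anyway. A standalone sketch of that pattern (the test port in `main()` is arbitrary):

```cpp
// Bounded-retry UDP send: retry on transient errors, give up otherwise.
#include <arpa/inet.h>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static ssize_t sendto_with_retry(int fd, const void *buf, size_t len,
                                 const struct sockaddr *dest, socklen_t dest_len,
                                 int max_retries)
{
    for (int attempt = 0; attempt < max_retries; ++attempt)
    {
        ssize_t n = sendto(fd, buf, len, 0, dest, dest_len);
        if (n >= 0)
            return n;                 /* UDP datagrams go out whole or not at all */
        if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
            continue;                 /* transient: try again */
        std::perror("sendto");        /* hard error: report and give up */
        return -1;
    }
    return -1;                        /* retries exhausted */
}

int main()
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0)
        return 1;

    struct sockaddr_in dest;
    std::memset(&dest, 0, sizeof(dest));
    dest.sin_family      = AF_INET;
    dest.sin_port        = htons(9999);              /* arbitrary test port */
    dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    const char msg[] = "ping";
    ssize_t n = sendto_with_retry(fd, msg, sizeof(msg),
                                  reinterpret_cast<struct sockaddr *>(&dest),
                                  sizeof(dest), /*max_retries=*/10);
    std::printf("sent %zd bytes\n", n);
    close(fd);
    return 0;
}
```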
+ * + */ +void +UDPConn::sendDisorderAck(uint32 seq, uint32 extraSeq, uint32 lostPktCnt) +{ + icpkthdr *disorderBuffer = rx_control_info.disorderBuffer; + + memcpy(disorderBuffer, (char *) &this->conn_info, sizeof(icpkthdr)); + + disorderBuffer->flags |= UDPIC_FLAGS_DISORDER; + disorderBuffer->seq = seq; + disorderBuffer->extraSeq = extraSeq; + disorderBuffer->len = lostPktCnt * sizeof(uint32) + sizeof(icpkthdr); + +#ifdef AMS_VERBOSE_LOGGING + if (!(this->peer.ss_family == AF_INET || this->peer.ss_family == AF_INET6)) + { + LOG(INFO, "UDP Interconnect bug (in sendDisorderAck): trying to send ack when we don't know where to send to %s", this->remoteHostAndPort); + } +#endif + + sendControlMessage(disorderBuffer, UDP_listenerFd, (struct sockaddr *) &this->peer, this->peer_len); +} + +/* + * sendStatusQueryMessage + * Used by senders to send a status query message for a connection to receivers. + * + * When receivers get such a message, they will respond with + * the connection status (consumed seq, received seq ...). + */ +void +UDPConn::sendStatusQueryMessage(uint32 seq) +{ + icpkthdr msg; + + memcpy(&msg, (char *) &this->conn_info, sizeof(msg)); + msg.flags = UDPIC_FLAGS_CAPACITY; + msg.seq = seq; + msg.extraSeq = 0; + msg.len = sizeof(msg); + +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.update(TPE_ACK_PKT_QUERY, &msg); +#endif + + sendControlMessage(&msg, entry_->txfd, (struct sockaddr *) &this->peer, this->peer_len); +} + +/* + * ReleaseBuffer + * Return a buffer and send an acknowledgment. + * + * SHOULD BE CALLED WITH ic_control_info.lock *LOCKED* + */ +void +UDPConn::ReleaseBuffer(AckSendParam *param) +{ + icpkthdr *buf; + uint32 seq; + + buf = (icpkthdr *) this->pkt_q[this->pkt_q_head]; + if (buf == NULL) + { + pthread_mutex_unlock(&ic_control_info.lock); + throw ICFatalException("FATAL: ReleaseBuffer: buffer is NULL", __FILE__, __LINE__); + } + + seq = buf->seq; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "LOG: ReleaseBuffer conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", + this, seq, buf->motNodeId, this->route, this->conn_info.seq - this->pkt_q_size, this->pkt_q_size, this->pkt_q_head, this->pkt_q_tail); +#endif + + this->pkt_q[this->pkt_q_head] = NULL; + this->pBuff = NULL; + this->pkt_q_head = (this->pkt_q_head + 1) % this->pkt_q_capacity; + this->pkt_q_size--; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "LOG: ReleaseBuffer conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", + this, seq, buf->motNodeId, this->route, this->conn_info.seq - this->pkt_q_size, this->pkt_q_size, this->pkt_q_head, this->pkt_q_tail); +#endif + + rx_buffer_pool.put(buf); + this->conn_info.extraSeq = seq; + + /* Send an Ack to the sender. */ + if ((seq % 2 == 0) || (this->pkt_q_capacity == 1)) + { + if (param != NULL) + { + this->setAckParam(param, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | this->conn_info.flags, this->conn_info.seq - 1, seq); + } + else + { + this->sendAck(UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | this->conn_info.flags, this->conn_info.seq - 1, seq); + } + } +} + +/* + * computeExpirationPeriod + * Compute expiration period according to the connection information. + * + * Considerations on expiration period computation: + * + * RTT is dynamically computed, and expiration period is based on RTT values. + * We cannot simply use RTT as the expiration value, since real workload does + * not always have a stable RTT. 
A small constant value is multiplied to the RTT value + * to make the resending logic insensitive to the frequent small changes of RTT. + * + */ +uint64 +UDPConn::computeExpirationPeriod(uint32 retry) +{ + /* + * In fault injection mode, we often use DEFAULT_RTT, because the + * intentional large percent of packet/ack losses will make the RTT too + * large. This will lead to a slow retransmit speed. In real hardware + * environment/workload, we do not expect such a packet loss pattern. + */ +#ifdef USE_ASSERT_CHECKING + if (udp_testmode) + { + return DEFAULT_RTT; + } + else +#endif + { + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + return Min(retry > 3 ? this->rttvar.rto * retry : this->rttvar.rto, UNACK_QUEUE_RING_LENGTH_LOSS); + + uint32 factor = (retry <= 12 ? retry : 12); + return Max(MIN_EXPIRATION_PERIOD, Min(MAX_EXPIRATION_PERIOD, (int)(this->rtt + (this->dev << 2)) << (factor))); + } +} + +/* + * freeDisorderedPackets + * Put the disordered packets into free buffer list. + */ +void +UDPConn::freeDisorderedPackets() +{ + uint32 k; + + if (this->pkt_q == NULL) + return; + + for (k = 0; k < this->pkt_q_capacity; k++) + { + icpkthdr *buf = (icpkthdr *)this->pkt_q[k]; + + if (buf != NULL) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "CLEAR Out-of-order PKT: conn %p pkt [seq %d] for node %d route %d, " + "[head seq] %d queue size %d, queue head %d queue tail %d", + this, buf->seq, buf->motNodeId, this->route, this->conn_info.seq - this->pkt_q_size, + this->pkt_q_size, this->pkt_q_head, this->pkt_q_tail); + + /* return the buffer into the free list. */ + rx_buffer_pool.put(buf); + this->pkt_q[k] = NULL; + } + } +} + +/* + * prepareRxConnForRead + * Prepare the receive connection for reading. + * + * MUST BE CALLED WITH ic_control_info.lock LOCKED. + */ +void +UDPConn::prepareRxConnForRead() +{ + LOG(DEBUG3, "In prepareRxConnForRead: conn %p, q_head %d q_tail %d q_size %d", + this, this->pkt_q_head, this->pkt_q_tail, this->pkt_q_size); + + Assert(this->pkt_q[this->pkt_q_head] != NULL); + this->pBuff = this->pkt_q[this->pkt_q_head]; + this->msgPos = this->pBuff; + this->msgSize = ((icpkthdr *) this->pBuff)->len; + this->recvBytes = this->msgSize; +} + +/* + * DeactiveConn + * Mark the connection inactive. + */ +void +UDPConn::DeactiveConn() +{ + pthread_mutex_lock(&ic_control_info.lock); + this->stillActive = false; + pthread_mutex_unlock(&ic_control_info.lock); +} + +/* + * handleAckedPacket + * Called by sender to process acked packet. + * + * Remove it from unack queue and unack queue ring, change the rtt ... + * + * RTT (Round Trip Time) is computed as the time between we send the packet + * and receive the acknowledgement for the packet. When an acknowledgement + * is received, an estimated RTT value (called SRTT, smoothed RTT) is updated + * by using the following equation. And we also set a limitation of the max + * value and min value for SRTT. + * (1) SRTT = (1 - g) SRTT + g x RTT (0 < g < 1) + * where RTT is the measured round trip time of the packet. In implementation, + * g is set to 1/8. In order to compute expiration period, we also compute an + * estimated delay variance SDEV by using: + * (2) SDEV = (1 - h) x SDEV + h x |SERR| (0 < h < 1, In implementation, h is set to 1/4) + * where SERR is calculated by using: + * (3) SERR = RTT - SRTT + * Expiration period determines the timing we resend a packet. A long RTT means + * a long expiration period. 
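+ * Illustrative example (numbers assumed): with g = 1/8, an SRTT of 800us and a measured RTT of 1600us, the new SRTT is 7/8 * 800 + 1/8 * 1600 = 900us; this corresponds to the fixed-point update rtt = rtt - (rtt >> RTT_SHIFT_COEFFICIENT) + (ackTime >> RTT_SHIFT_COEFFICIENT) used below.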
Delay variance is used to incorporate the variance + * of workload/network variances at different time. When a packet is retransmitted, + * we back off exponentially the expiration period. + * (4) exp_period = (SRTT + y x SDEV) << retry + * Here y is a constant (In implementation, we use 4) and retry is the times the + * packet is retransmitted. + */ +void +UDPConn::handleAckedPacket(ICBuffer *buf, uint64 now, struct icpkthdr *pkt) +{ + uint64 ackTime = 0; + bool bufIsHead = false; + UDPConn *bufConn = NULL; + + bufIsHead = (&buf->primary == this->unackQueue.first()); + + buf = this->unackQueue.remove(buf); + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC || session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER_IC) + { + bufConn = static_cast(buf->conn); + buf = unack_queue_ring.slots[buf->unackQueueRingSlot].remove(buf); + unack_queue_ring.numOutStanding--; + if (this->unackQueue.length() >= 1) + unack_queue_ring.numSharedOutStanding--; + + ackTime = now - buf->sentTime; + + if (buf->nRetry == 0) + { + /* adjust the congestion control window. */ + if (snd_control_info.cwnd < snd_control_info.ssthresh) + snd_control_info.cwnd += 2; + else + snd_control_info.cwnd += 1 / snd_control_info.cwnd; + snd_control_info.cwnd = Min(snd_control_info.cwnd, snd_buffer_pool.maxCount); + } + + if ((bufConn->rttvar.rto << 1) > ackTime && pkt->retry_times != session_param.Gp_interconnect_min_retries_before_timeout) + estimateRTT(bufConn, (now - pkt->send_time)); + + if (buf->nRetry && pkt->retry_times > 0 && pkt->retry_times < session_param.Gp_interconnect_min_retries_before_timeout) + bufConn->rttvar.rto += (bufConn->rttvar.rto >> 4 * buf->nRetry); + + if (unlikely(session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER_IC)) + { + bufConn->sndvar.ts_rto = bufConn->rttvar.rto; + addtoRTOList(&mudp, bufConn); + } + } + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC) + { + ICBufferList *alist = &unack_queue_ring.slots[buf->unackQueueRingSlot]; + buf = alist->remove(buf); + unack_queue_ring.numOutStanding--; + if (this->unackQueue.length() >= 1) + unack_queue_ring.numSharedOutStanding--; + + ackTime = now - buf->sentTime; + + /* + * In udp_testmode, we do not change rtt dynamically due to the large + * number of packet losses introduced by fault injection code. This + * can decrease the testing time. + */ +#ifdef USE_ASSERT_CHECKING + if (!udp_testmode) +#endif + { + uint64 newRTT = 0; + uint64 newDEV = 0; + + if (buf->nRetry == 0) + { + bufConn = static_cast(buf->conn); + newRTT = bufConn->rtt - (bufConn->rtt >> RTT_SHIFT_COEFFICIENT) + (ackTime >> RTT_SHIFT_COEFFICIENT); + newRTT = Min(MAX_RTT, Max(newRTT, MIN_RTT)); + bufConn->rtt = newRTT; + + newDEV = bufConn->dev - (bufConn->dev >> DEV_SHIFT_COEFFICIENT) + ((Max(ackTime, newRTT) - Min(ackTime, newRTT)) >> DEV_SHIFT_COEFFICIENT); + newDEV = Min(MAX_DEV, Max(newDEV, MIN_DEV)); + bufConn->dev = newDEV; + + /* adjust the congestion control window. 
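+ * Below ssthresh the window grows by 1 for each acked first-transmission packet; above it, by roughly 1/cwnd per ack. In both cases it is capped at snd_buffer_pool.maxCount.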
*/ + if (snd_control_info.cwnd < snd_control_info.ssthresh) + snd_control_info.cwnd += 1; + else + snd_control_info.cwnd += 1 / snd_control_info.cwnd; + snd_control_info.cwnd = Min(snd_control_info.cwnd, snd_buffer_pool.maxCount); + } + } + } + + bufConn = static_cast(buf->conn); + bufConn->stat_total_ack_time += ackTime; + bufConn->stat_max_ack_time = Max(ackTime, bufConn->stat_max_ack_time); + bufConn->stat_min_ack_time = Min(ackTime, bufConn->stat_min_ack_time); + + /* + * only change receivedAckSeq when it is the smallest pkt we sent and have + * not received ack for it. + */ + if (bufIsHead) + this->receivedAckSeq = buf->pkt->seq; + + /* The first packet acts like a connect setup packet */ + if (buf->pkt->seq == 1) + this->state = mcsStarted; + + snd_buffer_pool.freeList.append(buf); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "REMOVEPKT %d from unack queue for route %d (retry %d) sndbufmaxcount %d sndbufcount %d " + "sndbuffreelistlen %d, sntSeq %d consumedSeq %d recvAckSeq %d capacity %d, sndQ %d, unackQ %d", + buf->pkt->seq, this->route, buf->nRetry, snd_buffer_pool.maxCount, snd_buffer_pool.count, + snd_buffer_pool.freeList.length(), bufConn->sentSeq, bufConn->consumedSeq, + bufConn->receivedAckSeq, bufConn->capacity, bufConn->sndQueue.length(), + bufConn->unackQueue.length()); + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + bufConn->unackQueue.icBufferListLog(); + bufConn->sndQueue.icBufferListLog(); + } +#endif +} + +/* + * dispatcherAYT + * Check the connection from the dispatcher to verify that it is still there. + * + * The connection is a struct Port, stored in the global MyProcPort. + * + * Return true if the dispatcher connection is still alive. + */ +static bool +dispatcherAYT(void) +{ + ssize_t ret; + char buf; + + /* + * For background worker or auxiliary process like gdd, there is no client. + * As a result, MyProcPort is NULL. We should skip dispatcherAYT check here. + */ + if (global_param.MyProcPort == false) + return true; + + if (global_param.myprocport_sock < 0) + return false; + +#ifndef WIN32 + ret = recv(global_param.myprocport_sock, &buf, 1, MSG_PEEK | MSG_DONTWAIT); +#else + ret = recv(global_param.myprocport_sock, &buf, 1, MSG_PEEK | MSG_PARTIAL); +#endif + + if (ret == 0) /* socket has been closed. EOF */ + return false; + + if (ret > 0) /* data waiting on socket, it must be OK. */ + return true; + + if (ret == -1) /* error, or would be block. */ + { + if (errno == EAGAIN || errno == EINPROGRESS) + return true; /* connection intact, no data available */ + else + return false; + } + /* not reached */ + + return true; +} + +/* + * checkQDConnectionAlive + * Check whether QD connection is still alive. If not, report error. + */ +static void +checkQDConnectionAlive(void) +{ + if (!dispatcherAYT()) + { + if (global_param.Gp_role == GP_ROLE_EXECUTE_IC) + throw ICNetworkException("interconnect error segment lost contact with master (recv)", __FILE__, __LINE__); + else + throw ICNetworkException("interconnect error master lost contact with client (recv)", __FILE__, __LINE__); + } +} + +/* + * getCurrentTime + * get current time + * + */ +static uint64 +getCurrentTime(void) +{ + struct timeval newTime; + int status = 1; + uint64 t = 0; + +#if HAVE_LIBRT + /* Use clock_gettime to return monotonic time value. 
*/ + struct timespec ts; + + status = clock_gettime(CLOCK_MONOTONIC, &ts); + + newTime.tv_sec = ts.tv_sec; + newTime.tv_usec = ts.tv_nsec / 1000; + +#endif + + if (status != 0) + gettimeofday(&newTime, NULL); + + t = ((uint64) newTime.tv_sec) * USECS_PER_SECOND + newTime.tv_usec; + return t; +} + +/* + * putIntoUnackQueueRing + * Put the buffer into the ring. + * + * expTime - expiration time from now + * + */ +static void +putIntoUnackQueueRing(UnackQueueRing *uqr, ICBuffer *buf, uint64 expTime, uint64 now) +{ + UDPConn *buffConn = static_cast(buf->conn); + uint64 diff = 0; + int idx = 0; + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + /* The first packet, currentTime is not initialized */ +#ifndef TIMEOUT_Z + if (uqr->currentTime == 0) + uqr->currentTime = now - (now % TIMER_SPAN_LOSS); +#else + if (uqr->currentTime == 0 && buffConn->rttvar.rto == 0) + uqr->currentTime = now - (now % TIMER_SPAN_LOSS); + else + uqr->currentTime = now + buffConn->rttvar.rto; + +#endif + diff = expTime; + if (diff >= UNACK_QUEUE_RING_LENGTH_LOSS) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "putIntoUnackQueueRing:" "now " UINT64_FORMAT "expTime " UINT64_FORMAT "diff " UINT64_FORMAT "uqr-currentTime " UINT64_FORMAT, now, expTime, diff, uqr->currentTime); +#endif + diff = UNACK_QUEUE_RING_LENGTH_LOSS - 1; + } + else if (diff < TIMER_SPAN_LOSS) + { + diff = diff < TIMER_SPAN_LOSS ? TIMER_SPAN_LOSS : diff; + } + } + else + { + if (uqr->currentTime == 0) + uqr->currentTime = now - (now % TIMER_SPAN_LOSS); + + diff = now + expTime - uqr->currentTime; + if (diff >= UNACK_QUEUE_RING_LENGTH) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "putIntoUnackQueueRing:" "now " UINT64_FORMAT "expTime " UINT64_FORMAT "diff " UINT64_FORMAT "uqr-currentTime " UINT64_FORMAT, now, expTime, diff, uqr->currentTime); +#endif + diff = UNACK_QUEUE_RING_LENGTH - 1; + } + else if (diff < TIMER_SPAN) + { + diff = TIMER_SPAN; + } + + idx = (uqr->idx + diff / TIMER_SPAN) % UNACK_QUEUE_RING_SLOTS_NUM; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "PUTTW: curtime " UINT64_FORMAT " now " UINT64_FORMAT " (diff " UINT64_FORMAT ") expTime " UINT64_FORMAT " previdx %d, nowidx %d, nextidx %d", uqr->currentTime, now, diff, expTime, buf->unackQueueRingSlot, uqr->idx, idx); +#endif + } + + idx = (uqr->idx + diff / TIMER_SPAN) % UNACK_QUEUE_RING_SLOTS_NUM; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "PUTTW: curtime %lu now %lu (diff %lu) expTime %lu previdx %d, nowidx %d, nextidx %d", uqr->currentTime, now, diff, expTime, buf->unackQueueRingSlot, uqr->idx, idx); +#endif + + buf->unackQueueRingSlot = idx; + unack_queue_ring.slots[idx].append(buf); +} + +/* + * handleDataPacket + * Handling the data packet. + * + * On return, will set *wakeup_mainthread, if a packet was received successfully + * and the caller should wake up the main thread, after releasing the mutex. + */ +static bool +handleDataPacket(UDPConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, socklen_t *peerlen, + AckSendParam *param, bool *wakeup_mainthread) +{ + if ((pkt->len == sizeof(icpkthdr)) && (pkt->flags & UDPIC_FLAGS_CAPACITY)) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "status queuy message received, seq %d, srcpid %d, dstpid %d, icid %d, sid %d", + pkt->seq, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->sessionId); + +#ifdef AMS_VERBOSE_LOGGING + logPkt("STATUS QUERY MESSAGE", pkt); +#endif + uint32 seq = conn->conn_info.seq > 0 ? conn->conn_info.seq - 1 : 0; + uint32 extraSeq = conn->stopRequested ? 
seq : conn->conn_info.extraSeq; + + conn->setAckParam(param, UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_ACK | conn->conn_info.flags, seq, extraSeq); + + return false; + } + + /* + * when we're not doing a full-setup on every statement, we've got to + * update the peer info -- full setups do this at setup-time. + */ + + /* + * Note the change here, for process start race and disordered message, if + * we do not fill in peer address, then we may send some acks to unknown + * address. Thus, the following condition is used. + * + */ + if (pkt->seq <= conn->pkt_q_capacity) + { + /* fill in the peer. Need to cast away "volatile". ugly */ + memset((void *) &conn->peer, 0, sizeof(conn->peer)); + memcpy((void *) &conn->peer, peer, *peerlen); + conn->peer_len = *peerlen; + + conn->conn_info.dstListenerPort = pkt->dstListenerPort; + if (IC_DEBUG2 >= session_param.log_min_messages) + LOG(DEBUG2, "received the head packets when eliding setup, pkt seq %d", pkt->seq); + } + + /* data packet */ + if (pkt->flags & UDPIC_FLAGS_EOS) + { + if (IC_DEBUG3 >= session_param.log_min_messages) + LOG(DEBUG3, "received packet with EOS motid %d route %d seq %d", + pkt->motNodeId, conn->route, pkt->seq); + } + + /* + * if we got a stop, but didn't request a stop -- ignore, this is a + * startup blip: we must have acked with a stop -- we don't want to do + * anything further with the stop-message if we didn't request a stop! + * + * this is especially important after eliding setup is enabled. + */ + if (!conn->stopRequested && (pkt->flags & UDPIC_FLAGS_STOP)) + { + if (pkt->flags & UDPIC_FLAGS_EOS) + { + LOG(INFO, "non-requested stop flag, EOS! seq %d, flags 0x%x", pkt->seq, pkt->flags); + } + return false; + } + + if (conn->stopRequested && conn->stillActive) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC && IC_DEBUG5 >= session_param.log_min_messages) + LOG(DEBUG5, "rx_thread got packet on active connection marked stopRequested. " + "(flags 0x%x) node %d route %d pkt seq %d conn seq %d", + pkt->flags, pkt->motNodeId, conn->route, pkt->seq, conn->conn_info.seq); + + /* can we update stillActive ? */ + if (IC_DEBUG2 >= session_param.log_min_messages) + if (!(pkt->flags & UDPIC_FLAGS_STOP) && !(pkt->flags & UDPIC_FLAGS_EOS)) + LOG(DEBUG2, "stop requested but no stop flag on return packet ?!"); + + if (pkt->flags & UDPIC_FLAGS_EOS) + conn->conn_info.flags |= UDPIC_FLAGS_EOS; + + if (conn->conn_info.seq < pkt->seq) + conn->conn_info.seq = pkt->seq; /* note here */ + + conn->setAckParam(param, UDPIC_FLAGS_ACK | UDPIC_FLAGS_STOP | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, pkt->seq, pkt->seq); + + /* we only update stillActive if eos has been sent by peer. */ + if (pkt->flags & UDPIC_FLAGS_EOS) + { + if (IC_DEBUG2 >= session_param.log_min_messages) + LOG(DEBUG2, "stop requested and acknowledged by sending peer"); + conn->stillActive = false; + } + + return false; + } + + /* dropped ack or timeout */ + if (pkt->seq < conn->conn_info.seq) + { + ic_statistics.duplicatedPktNum++; + if (IC_DEBUG3 >= session_param.log_min_messages) + LOG(DEBUG3, "dropped ack ? 
ignored data packet w/ cmd %d conn->cmd %d node %d route %d seq %d expected %d flags 0x%x", + pkt->icId, conn->conn_info.icId, pkt->motNodeId, conn->route, pkt->seq, conn->conn_info.seq, pkt->flags); + + conn->setAckParam(param, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, conn->conn_info.extraSeq); + + return false; + } + + /* sequence number is correct */ + if (!conn->stillActive) + { + /* peer may have dropped ack */ + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE_IC && + IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "received on inactive connection node %d route %d (seq %d pkt->seq %d)", + pkt->motNodeId, conn->route, conn->conn_info.seq, pkt->seq); + + if (conn->conn_info.seq < pkt->seq) + conn->conn_info.seq = pkt->seq; + conn->setAckParam(param, UDPIC_FLAGS_ACK | UDPIC_FLAGS_STOP | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, pkt->seq, pkt->seq); + + return false; + } + + /* headSeq is the seq for the head packet. */ + uint32 headSeq = conn->conn_info.seq - conn->pkt_q_size; + + if ((conn->pkt_q_size == conn->pkt_q_capacity) || (pkt->seq - headSeq >= conn->pkt_q_capacity)) + { + /* + * Error case: NO RX SPACE or out of range pkt This indicates a bug. + */ + logPkt("Interconnect error: received a packet when the queue is full ", pkt); + ic_statistics.disorderedPktNum++; + conn->stat_count_dropped++; + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER_IC && rx_control_info.mainWaitingState.waiting && + rx_control_info.mainWaitingState.waitingNode == pkt->motNodeId && + rx_control_info.mainWaitingState.waitingQuery == pkt->icId) + { + if (rx_control_info.mainWaitingState.waitingRoute == ANY_ROUTE) + { + if (rx_control_info.mainWaitingState.reachRoute == ANY_ROUTE) + rx_control_info.mainWaitingState.reachRoute = conn->route; + } + else if (rx_control_info.mainWaitingState.waitingRoute == conn->route) + { + if (IC_DEBUG2 >= session_param.log_min_messages) + LOG(INFO, "rx thread: main_waiting waking it route %d", rx_control_info.mainWaitingState.waitingRoute); + rx_control_info.mainWaitingState.reachRoute = conn->route; + } + /* WAKE MAIN THREAD HERE */ + *wakeup_mainthread = true; + } + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + conn->setAckParam(param, UDPIC_FLAGS_FULL, conn->conn_info.seq - 1, conn->conn_info.extraSeq); + } + return false; + } + + /* put the packet at the his position */ + bool toWakeup = false; + + int pos = (pkt->seq - 1) % conn->pkt_q_capacity; + + if (conn->pkt_q[pos] == NULL) + { + conn->pkt_q[pos] = (uint8 *) pkt; + if (pos == conn->pkt_q_head) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "SAVE pkt at QUEUE HEAD [seq %d] for node %d route %d, queue head seq %d, queue size %d, queue head %d queue tail %d", + pkt->seq, pkt->motNodeId, conn->route, headSeq, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); +#endif + toWakeup = true; + } + + if (pos == conn->pkt_q_tail) + { + /* move the queue tail */ + for (; conn->pkt_q[conn->pkt_q_tail] != NULL && conn->pkt_q_size < conn->pkt_q_capacity;) + { + conn->pkt_q_size++; + conn->pkt_q_tail = (conn->pkt_q_tail + 1) % conn->pkt_q_capacity; + conn->conn_info.seq++; + } + + /* set the EOS flag */ + if (((icpkthdr *) (conn->pkt_q[(conn->pkt_q_tail + conn->pkt_q_capacity - 1) % conn->pkt_q_capacity]))->flags & UDPIC_FLAGS_EOS) + { + conn->conn_info.flags |= UDPIC_FLAGS_EOS; + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "RX_THREAD: the packet 
with EOS flag is available for access in the queue for route %d", conn->route); + } + + /* ack data packet */ + conn->setAckParam(param, UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_ACK | conn->conn_info.flags, conn->conn_info.seq - 1, conn->conn_info.extraSeq); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "SAVE conn %p pkt at QUEUE TAIL [seq %d] at pos [%d] for node %d route %d, [head seq] %d, queue size %d, queue head %d queue tail %d", + conn, pkt->seq, pos, pkt->motNodeId, conn->route, headSeq, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); +#endif + } + else /* deal with out-of-order packet */ + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "SAVE conn %p OUT-OF-ORDER pkt [seq %d] at pos [%d] for node %d route %d, [head seq] %d, queue size %d, queue head %d queue tail %d", + conn, pkt->seq, pos, pkt->motNodeId, conn->route, headSeq, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); + + /* send an ack for out-of-order packet */ + ic_statistics.disorderedPktNum++; + conn->handleDisorderPacket(pos, headSeq + conn->pkt_q_size, pkt); + } + } + else /* duplicate pkt */ + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "DUPLICATE pkt [seq %d], [head seq] %d, queue size %d, queue head %d queue tail %d", + pkt->seq, headSeq, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); + + conn->setAckParam(param, UDPIC_FLAGS_DUPLICATE | conn->conn_info.flags, pkt->seq, conn->conn_info.seq - 1); + ic_statistics.duplicatedPktNum++; + return false; + } + + /* Was the main thread waiting for something ? */ + if (rx_control_info.mainWaitingState.waiting && + rx_control_info.mainWaitingState.waitingNode == pkt->motNodeId && + rx_control_info.mainWaitingState.waitingQuery == pkt->icId && toWakeup) + { + if (rx_control_info.mainWaitingState.waitingRoute == ANY_ROUTE) + { + if (rx_control_info.mainWaitingState.reachRoute == ANY_ROUTE) + rx_control_info.mainWaitingState.reachRoute = conn->route; + } + else if (rx_control_info.mainWaitingState.waitingRoute == conn->route) + { + if (IC_DEBUG2 >= session_param.log_min_messages) + LOG(DEBUG2, "rx thread: main_waiting waking it route %d", rx_control_info.mainWaitingState.waitingRoute); + rx_control_info.mainWaitingState.reachRoute = conn->route; + } + /* WAKE MAIN THREAD HERE */ + *wakeup_mainthread = true; + } + + return true; +} + +/* + * rxThreadFunc + * Main function of the receive background thread. + * + * NOTE: This function MUST NOT contain elog or ereport statements. + * elog is NOT thread-safe. Developers should instead use something like: + * + * NOTE: In threads, we cannot use palloc/pfree, because it's not thread safe. + */ +static void * +rxThreadFunc(void *arg) +{ + icpkthdr *pkt = NULL; + bool skip_poll = false; + + for (;;) + { + struct pollfd nfd; + int n; + + /* check shutdown condition */ + if (ic_atomic_read_u32(&ic_control_info.shutdown) == 1) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "udp-ic: rx-thread shutting down"); + break; + } + + /* Try to get a buffer */ + if (pkt == NULL) + { + pthread_mutex_lock(&ic_control_info.lock); + pkt = rx_buffer_pool.get(); + pthread_mutex_unlock(&ic_control_info.lock); + + if (pkt == NULL) + { + setRxThreadError(ENOMEM); + continue; + } + } + + if (!skip_poll) + { + /* Do we have inbound traffic to handle ? 
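+ * The finite RX_THREAD_POLL_TIMEOUT keeps this loop responsive to the shutdown flag even when no traffic arrives.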
*/ + nfd.fd = UDP_listenerFd; + nfd.events = POLLIN; + + n = poll(&nfd, 1, RX_THREAD_POLL_TIMEOUT); + + if (ic_atomic_read_u32(&ic_control_info.shutdown) == 1) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "udp-ic: rx-thread shutting down"); + break; + } + + if (n < 0) + { + if (errno == EINTR) + continue; + + /* + * ERROR case: if simply break out the loop here, there will + * be a hung here, since main thread will never be waken up, + * and senders will not get responses anymore. + * + * Thus, we set an error flag, and let main thread to report + * an error. + */ + setRxThreadError(errno); + continue; + } + + if (n == 0) + continue; + } + + if (skip_poll || (n == 1 && (nfd.events & POLLIN))) + { + /* we've got something interesting to read */ + /* handle incoming */ + /* ready to read on our socket */ + int read_count = 0; + + struct sockaddr_storage peer; + socklen_t peerlen; + + peerlen = sizeof(peer); + read_count = recvfrom(UDP_listenerFd, (char *) pkt, global_param.Gp_max_packet_size, 0, + (struct sockaddr *) &peer, &peerlen); + + if (ic_atomic_read_u32(&ic_control_info.shutdown) == 1) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "udp-ic: rx-thread shutting down"); + break; + } + + if (IC_DEBUG5 >= session_param.log_min_messages) + LOG(DEBUG5, "received inbound len %d", read_count); + + if (read_count < 0) + { + skip_poll = false; + + if (errno == EWOULDBLOCK || errno == EINTR) + continue; + + LOG(LOG_ERROR, "Interconnect error: recvfrom (%d)", errno); + + /* + * ERROR case: if simply break out the loop here, there will + * be a hung here, since main thread will never be waken up, + * and senders will not get responses anymore. + * + * Thus, we set an error flag, and let main thread to report + * an error. + */ + setRxThreadError(errno); + continue; + } + + if (static_cast(read_count) < sizeof(icpkthdr)) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "Interconnect error: short conn receive (%d)", read_count); + continue; + } + + /* + * when we get a "good" recvfrom() result, we can skip poll() + * until we get a bad one. + */ + skip_poll = true; + + /* length must be >= 0 */ + if (pkt->len < 0) + { + if (IC_DEBUG3 >= session_param.log_min_messages) + LOG(DEBUG3, "received inbound with negative length"); + continue; + } + + if (pkt->len != static_cast(read_count)) + { + if (IC_DEBUG3 >= session_param.log_min_messages) + LOG(DEBUG3, "received inbound packet [%d], short: read %d bytes, pkt->len %d", pkt->seq, read_count, pkt->len); + continue; + } + + /* + * check the CRC of the payload. + */ + if (session_param.gp_interconnect_full_crc) + { + if (!checkCRC(pkt)) + { + ic_atomic_add_fetch_u32((ic_atomic_uint32 *) &ic_statistics.crcErrors, 1); + if (IC_DEBUG2 >= session_param.log_min_messages) + LOG(DEBUG2, "received network data error, dropping bad packet, user data unaffected."); + continue; + } + } + +#ifdef AMS_VERBOSE_LOGGING + logPkt("GOT MESSAGE", pkt); +#endif + + bool wakeup_mainthread = false; + AckSendParam param; + + memset(¶m, 0, sizeof(AckSendParam)); + + /* + * Get the connection for the pkt. + * + * The connection hash table should be locked until finishing the + * processing of the packet to avoid the connection + * addition/removal from the hash table during the mean time. 
+ */ + pthread_mutex_lock(&ic_control_info.lock); + UDPConn *conn = ic_control_info.connHtab.find(pkt); + if (conn != NULL) + { + uint64 now = getCurrentTime(); + uint64 send_time = pkt->send_time; + uint64 recv_time = now; + uint64 retry_times = pkt->retry_times; + + bool drop_ack = pkt->seq < conn->conn_info.seq ? true : false; + /* Handling a regular packet */ + if (handleDataPacket(conn, pkt, &peer, &peerlen, ¶m, &wakeup_mainthread)) + pkt = NULL; + if (!pkt) + { + param.msg.send_time = send_time; + param.msg.recv_time = recv_time; + param.msg.retry_times = retry_times; + } + if (drop_ack) + param.msg.retry_times = session_param.Gp_interconnect_min_retries_before_timeout; + ic_statistics.recvPktNum++; + } + else + { + /* + * There may have two kinds of Mismatched packets: a) Past + * packets from previous command after I was torn down b) + * Future packets from current command before my connections + * are built. + * + * The handling logic is to "Ack the past and Nak the future". + */ + if ((pkt->flags & UDPIC_FLAGS_RECEIVER_TO_SENDER) == 0) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "mismatched packet received, seq %d, srcpid %d, dstpid %d, icid %d, sid %d", + pkt->seq, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->sessionId); + +#ifdef AMS_VERBOSE_LOGGING + logPkt("Got a Mismatched Packet", pkt); +#endif + + if (handleMismatch(pkt, &peer, peerlen)) + pkt = NULL; + ic_statistics.mismatchNum++; + } + } + pthread_mutex_unlock(&ic_control_info.lock); + + if (wakeup_mainthread) { + cv.notify_one(); + } + + /* + * real ack sending is after lock release to decrease the lock + * holding time. + */ + if (param.msg.len != 0) + UDPConn::sendAckWithParam(¶m); + } + + /* pthread_yield(); */ + } + + /* Before return, we release the packet. */ + if (pkt) + { + pthread_mutex_lock(&ic_control_info.lock); + rx_buffer_pool.release(pkt); + pkt = NULL; + pthread_mutex_unlock(&ic_control_info.lock); + } + + /* nothing to return */ + return NULL; +} + +/* + * handleMismatch + * If the mismatched packet is from an old connection, we may need to + * send an acknowledgment. + * + * We are called with the receiver-lock held, and we never release it. + * + * For QD: + * 1) Not in hashtable : NAK it/Do nothing + * Causes: a) Start race + * b) Before the entry for the ic instance is inserted, an error happened. + * c) From past transactions: should no happen. + * 2) Active in hashtable : NAK it/Do nothing + * Causes: a) Error reported after the entry is inserted, and connections are + * not inserted to the hashtable yet, and before teardown is called. + * 3) Inactive in hashtable: ACK it (with stop) + * Causes: a) Normal execution: after teardown is called on current command. + * b) Error case, 2a) after teardown is called. + * c) Normal execution: from past history transactions (should not happen). + * + * For QE: + * 1) pkt->id > ic_control_info.ic_instance_id : NAK it/Do nothing + * Causes: a) Start race + * b) Before ic_control_info.ic_instance_id is assigned to correct value, an error happened. + * 2) lastTornIcId < pkt->id == ic_control_info.ic_instance_id: NAK it/Do nothing + * Causes: a) Error reported after ic_control_info.ic_instance_id is set, and connections are + * not inserted to the hashtable yet, and before teardown is called. + * 3) lastTornIcId == pkt->id == ic_control_info.ic_instance_id: ACK it (with stop) + * Causes: a) Normal execution: after teardown is called on current command + * 4) pkt->id < ic_control_info.ic_instance_id: NAK it/Do nothing/ACK it. 
+ * Causes: a) Should not happen. + * + */ +static bool +handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) +{ + bool cached = false; + + /* + * we want to ack old packets; but *must* avoid acking connection + * requests: + * + * "ACK the past, NAK the future" explicit NAKs aren't necessary, we just + * don't want to ACK future packets, that confuses everyone. + */ + if (pkt->seq > 0 && pkt->sessionId == session_param.gp_session_id) + { + bool need_ack = false; + uint8 ack_flags = 0; + + /* + * The QD-backends can't use a counter, they've potentially got + * multiple instances (one for each active cursor) + */ + if (global_param.Gp_role == GP_ROLE_DISPATCH_IC) + { + struct CursorICHistoryEntry *p; + + p = rx_control_info.cursorHistoryTable.get(pkt->icId); + if (p) + { + if (p->status == 0) + { + /* Torn down. Ack the past. */ + need_ack = true; + } + else /* p->status == 1 */ + { + /* + * Not torn down yet. It happens when an error + * (out-of-memory, network error...) occurred after the + * cursor entry is inserted into the table in interconnect + * setup process. The peer will be canceled. + */ + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "GOT A MISMATCH PACKET WITH ID %d HISTORY THINKS IT IS ACTIVE", pkt->icId); + return cached; /* ignore, no ack */ + } + } + else + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "GOT A MISMATCH PACKET WITH ID %d HISTORY HAS NO RECORD", pkt->icId); + + /* + * No record means that two possibilities. 1) It is from the + * future. It is due to startup race. We do not ack future + * packets 2) Before the entry for the ic instance is + * inserted, an error happened. We do not ack for this case + * too. The peer will be canceled. + */ + ack_flags = UDPIC_FLAGS_NAK; + need_ack = false; + + if (session_param.gp_interconnect_cache_future_packets) + { + cached = cacheFuturePacket(pkt, peer, peer_len); + } + } + } + /* The QEs get to use a simple counter. */ + else if (global_param.Gp_role == GP_ROLE_EXECUTE_IC) + { + if (ic_control_info.ic_instance_id >= pkt->icId) + { + need_ack = true; + + /* + * We want to "ACK the past, but NAK the future." + * + * handleAck() will retransmit. 
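+ * For example (illustrative ids), with ic_instance_id = 9 on a QE: a packet for icId 9 arriving after teardown (lastTornIcId = 9) is answered with a stop-ACK, while one arriving after an error but before teardown (lastTornIcId = 8) is simply ignored.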
+ */ + if (pkt->seq >= 1 && pkt->icId > rx_control_info.lastTornIcId) + { + ack_flags = UDPIC_FLAGS_NAK; + need_ack = false; + } + } + else + { + /* + * ic_control_info.ic_instance_id < pkt->icId, from the future + */ + if (session_param.gp_interconnect_cache_future_packets) + { + cached = cacheFuturePacket(pkt, peer, peer_len); + } + } + } + + if (need_ack) + { + UDPConn dummyconn(NULL); + char buf[128]; /* numeric IP addresses shouldn't exceed + * about 50 chars, but play it safe */ + + memcpy(&dummyconn.conn_info, pkt, sizeof(icpkthdr)); + dummyconn.peer = *peer; + dummyconn.peer_len = peer_len; + + dummyconn.conn_info.flags |= ack_flags; + + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "ACKING PACKET WITH FLAGS: pkt->seq %d 0x%x [pkt->icId %d last-teardown %d interconnect_id %d]", + pkt->seq, dummyconn.conn_info.flags, pkt->icId, rx_control_info.lastTornIcId, ic_control_info.ic_instance_id); + + format_sockaddr_udp(&dummyconn.peer, buf, sizeof(buf)); + + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "ACKING PACKET TO %s", buf); + + if ((ack_flags & UDPIC_FLAGS_NAK) == 0) + { + ack_flags |= UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_RECEIVER_TO_SENDER; + } + else + { + ack_flags |= UDPIC_FLAGS_RECEIVER_TO_SENDER; + } + + /* + * There are two cases, we may need to send a response to sender + * here. One is start race and the other is receiver becomes idle. + * + * ack_flags here can take two possible values 1) UDPIC_FLAGS_NAK + * | UDPIC_FLAGS_RECEIVER_TO_SENDER (for start race) 2) + * UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | + * UDPIC_FLAGS_RECEIVER_TO_SENDER (for idle receiver) + * + * The final flags in the packet may take some extra bits such as + * 1) UDPIC_FLAGS_STOP 2) UDPIC_FLAGS_EOS 3) UDPIC_FLAGS_CAPACITY + * which are from original packet + */ + dummyconn.sendAck(ack_flags | dummyconn.conn_info.flags, dummyconn.conn_info.seq, dummyconn.conn_info.seq); + } + } + else + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "dropping packet from command-id %d seq %d (my cmd %d)", pkt->icId, pkt->seq, ic_control_info.ic_instance_id); + } + + return cached; +} + +/* + * cacheFuturePacket + * Cache the future packets during the setupUDPIFCInterconnect. + * + * Return true if packet is cached, otherwise false + */ +static bool +cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) +{ + UDPConn *conn = ic_control_info.startupCacheHtab.find(pkt); + if (conn == NULL) + { + try { + conn = new UDPConn(NULL); + } catch (const std::bad_alloc & e) { + errno = ENOMEM; + setRxThreadError(errno); + return false; + } + + memset((void *) conn, 0, sizeof(UDPConn)); + memcpy(&conn->conn_info, pkt, sizeof(icpkthdr)); + + conn->pkt_q_capacity = session_param.Gp_interconnect_queue_depth; + conn->pkt_q_size = session_param.Gp_interconnect_queue_depth; + conn->pkt_q = (uint8 **) ic_malloc(session_param.Gp_interconnect_queue_depth * sizeof(uint8 *)); + + if (conn->pkt_q == NULL) + { + /* malloc failed. */ + delete conn; + setRxThreadError(errno); + return false; + } + + /* We only use the array to store cached packets. */ + memset(conn->pkt_q, 0, session_param.Gp_interconnect_queue_depth * sizeof(uint8 *)); + + /* Put connection to the hashtable. */ + if (!ic_control_info.startupCacheHtab.add(conn)) + { + ic_free(conn->pkt_q); + delete conn; + setRxThreadError(errno); + return false; + } + + /* Setup the peer sock information. 
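+ * Cached packets are later handed to the real connections by handleCachedPackets() during interconnect setup, or returned to the buffer pool by cleanupStartupCache().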
*/ + memcpy(&conn->peer, peer, peer_len); + conn->peer_len = peer_len; + } + + /* + * Reject packets with invalid sequence numbers and packets which have + * been cached before. + */ + if (pkt->seq > conn->pkt_q_size || pkt->seq == 0 || conn->pkt_q[pkt->seq - 1] != NULL) + return false; + + conn->pkt_q[pkt->seq - 1] = (uint8 *) pkt; + rx_buffer_pool.maxCount++; + ic_statistics.startupCachedPktNum++; + + return true; +} + +/* + * cleanupStartupCache + * Clean the startup cache. + */ +static void +cleanupStartupCache() +{ + ConnHtabBin *bin = NULL; + UDPConn *cachedConn = NULL; + icpkthdr *pkt = NULL; + int i = 0; + uint32 j = 0; + + for (i = 0; i < ic_control_info.startupCacheHtab.size; i++) + { + bin = ic_control_info.startupCacheHtab.table[i]; + + while (bin) + { + cachedConn = bin->conn; + + for (j = 0; j < cachedConn->pkt_q_size; j++) + { + pkt = (icpkthdr *) cachedConn->pkt_q[j]; + + if (pkt == NULL) + continue; + + rx_buffer_pool.maxCount--; + rx_buffer_pool.put(pkt); + cachedConn->pkt_q[j] = NULL; + } + bin = bin->next; + ic_control_info.startupCacheHtab.remove(cachedConn); + + /* + * MPP-19981 free the cached connections; otherwise memory leak + * would be introduced. + */ + ic_free(cachedConn->pkt_q); + ic_free(cachedConn); + } + } +} + + +#ifdef USE_ASSERT_CHECKING + +/* The following functions are facility methods for debugging. + * They are quite useful when there are a large number of connections. + * These functions can be called from gdb to output internal information to a file. + */ + +/* + * dumpUnackQueueRing + * Dump an unack queue ring. + */ +static void +dumpUnackQueueRing(const char *fname) +{ + FILE *ofile = fopen(fname, "w+"); + int i; + + fprintf(ofile, "UnackQueueRing: currentTime %lu, idx %d numOutstanding %d numSharedOutstanding %d\n", + unack_queue_ring.currentTime, unack_queue_ring.idx, + unack_queue_ring.numOutStanding, unack_queue_ring.numSharedOutStanding); + fprintf(ofile, "==================================\n"); + for (i = 0; i < UNACK_QUEUE_RING_SLOTS_NUM; i++) + { + if (unack_queue_ring.slots[i].length() > 0) + { + unack_queue_ring.slots[i].dump_to_file(ofile); + } + } + + fclose(ofile); +} + +/* + * dumpConnections + * Dump connections. + */ +void +TransportEntry::dumpConnections(const char *fname) +{ + int i; + uint32 j; + + return; + + FILE *ofile = fopen(fname, "w+"); + + fprintf(ofile, "Entry connections: conn num %d \n", this->numConns); + fprintf(ofile, "==================================\n"); + for (i = 0; i < this->numConns; i++) + { + UDPConn *conn = this->GetConn(i); + + fprintf(ofile, "conns[%d] motNodeId=%d: remoteContentId=%d pid=%d sockfd=%d remote=%s " + "capacity=%d sentSeq=%d receivedAckSeq=%d consumedSeq=%d rtt=%lu" + " dev=%lu deadlockCheckBeginTime=%lu route=%d msgSize=%d msgPos=%p" + " recvBytes=%d tupleCount=%d stillActive=%d stopRequested=%d " + "state=%d\n", + i, this->motNodeId, + conn->remoteContentId, + conn->cdbProc ? 
conn->cdbProc->pid : 0, + conn->sockfd, + conn->remoteHostAndPort, + conn->capacity, conn->sentSeq, conn->receivedAckSeq, conn->consumedSeq, + conn->rtt, conn->dev, conn->deadlockCheckBeginTime, conn->route, conn->msgSize, conn->msgPos, + conn->recvBytes, conn->tupleCount, conn->stillActive, conn->stopRequested, + conn->state); + fprintf(ofile, "conn_info [%s: seq %d extraSeq %d]: motNodeId %d, crc %d len %d " + "srcContentId %d dstDesContentId %d " + "srcPid %d dstPid %d " + "srcListenerPort %d dstListernerPort %d " + "sendSliceIndex %d recvSliceIndex %d " + "sessionId %d icId %d " + "flags %d\n", + conn->conn_info.flags & UDPIC_FLAGS_RECEIVER_TO_SENDER ? "ACK" : "DATA", + conn->conn_info.seq, conn->conn_info.extraSeq, conn->conn_info.motNodeId, conn->conn_info.crc, conn->conn_info.len, + conn->conn_info.srcContentId, conn->conn_info.dstContentId, + conn->conn_info.srcPid, conn->conn_info.dstPid, + conn->conn_info.srcListenerPort, conn->conn_info.dstListenerPort, + conn->conn_info.sendSliceIndex, conn->conn_info.recvSliceIndex, + conn->conn_info.sessionId, conn->conn_info.icId, + conn->conn_info.flags); + + if (!ic_control_info.isSender) + { + fprintf(ofile, "pkt_q_size=%d pkt_q_head=%d pkt_q_tail=%d pkt_q=%p\n", conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail, conn->pkt_q); + for (j = 0; j < conn->pkt_q_capacity; j++) + { + if (conn->pkt_q != NULL && conn->pkt_q[j] != NULL) + { + icpkthdr *pkt = (icpkthdr *) conn->pkt_q[j]; + + fprintf(ofile, "Packet (pos %d) Info [%s: seq %d extraSeq %d]: motNodeId %d, crc %d len %d " + "srcContentId %d dstDesContentId %d " + "srcPid %d dstPid %d " + "srcListenerPort %d dstListernerPort %d " + "sendSliceIndex %d recvSliceIndex %d " + "sessionId %d icId %d " + "flags %d\n", + j, + pkt->flags & UDPIC_FLAGS_RECEIVER_TO_SENDER ? "ACK" : "DATA", + pkt->seq, pkt->extraSeq, pkt->motNodeId, pkt->crc, pkt->len, + pkt->srcContentId, pkt->dstContentId, + pkt->srcPid, pkt->dstPid, + pkt->srcListenerPort, pkt->dstListenerPort, + pkt->sendSliceIndex, pkt->recvSliceIndex, + pkt->sessionId, pkt->icId, + pkt->flags); + } + } + } + if (ic_control_info.isSender) + { + fprintf(ofile, "sndQueue "); + conn->sndQueue.dump_to_file(ofile); + fprintf(ofile, "unackQueue "); + conn->unackQueue.dump_to_file(ofile); + + dumpUnackQueueRing("/tmp/dumpUnackQueueRing"); + } + fprintf(ofile, "\n"); + } + fclose(ofile); +} +#endif + +/* + * logPkt + * Log a packet. + * + */ +static inline void +logPkt(const char *prefix, icpkthdr *pkt) +{ + LOG(INFO, "%s [%s: seq %d extraSeq %d]: motNodeId %d, crc %d len %d " + "srcContentId %d dstDesContentId %d " + "srcPid %d dstPid %d " + "srcListenerPort %d dstListernerPort %d " + "sendSliceIndex %d recvSliceIndex %d " + "sessionId %d icId %d " + "flags %d ", + prefix, pkt->flags & UDPIC_FLAGS_RECEIVER_TO_SENDER ? 
"ACK" : "DATA", + pkt->seq, pkt->extraSeq, pkt->motNodeId, pkt->crc, pkt->len, + pkt->srcContentId, pkt->dstContentId, + pkt->srcPid, pkt->dstPid, + pkt->srcListenerPort, pkt->dstListenerPort, + pkt->sendSliceIndex, pkt->recvSliceIndex, + pkt->sessionId, pkt->icId, + pkt->flags); +} + +/* + * Send a dummy packet to interconnect thread to exit poll() immediately + */ +static void +SendDummyPacket(void) +{ + int ret; + const char *dummy_pkt = "stop it"; + int counter; + struct sockaddr_storage dest; + socklen_t dest_len; + + Assert(udp_dummy_packet_sockaddr.ss_family == AF_INET || udp_dummy_packet_sockaddr.ss_family == AF_INET6); + Assert(ICSenderFamily == AF_INET || ICSenderFamily == AF_INET6); + + dest = udp_dummy_packet_sockaddr; + dest_len = (ICSenderFamily == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + + if (ICSenderFamily == AF_INET6) + { +#if defined(__darwin__) + if (udp_dummy_packet_sockaddr.ss_family == AF_INET6) + ConvertIPv6WildcardToLoopback(&dest); +#endif + if (udp_dummy_packet_sockaddr.ss_family == AF_INET) + ConvertToIPv4MappedAddr(&dest, &dest_len); + } + + if (ICSenderFamily == AF_INET && udp_dummy_packet_sockaddr.ss_family == AF_INET6) + { + /* the size of AF_INET6 is bigger than the side of IPv4, so + * converting from IPv6 to IPv4 may potentially not work. */ + LOG(INFO, "sending dummy packet failed: cannot send from AF_INET to receiving on AF_INET6"); + return; + } + + /* + * Send a dummy package to the interconnect listener, try 10 times. + * We don't want to close the socket at the end of this function, since + * the socket will eventually close during the motion layer cleanup. + */ + + counter = 0; + while (counter < 10) + { + counter++; + ret = sendto(ICSenderSocket, dummy_pkt, strlen(dummy_pkt), 0, (struct sockaddr *) &dest, dest_len); + if (ret < 0) + { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + continue; + else + { + LOG(INFO, "send dummy packet failed, sendto failed: %m"); + return; + } + } + break; + } + + if (counter >= 10) + { + LOG(INFO, "send dummy packet failed, sendto failed with 10 times: %m"); + } +} + +/* + * prepareXmit + * Prepare connection for transmit. + */ +void +UDPConn::prepareXmit() +{ + this->conn_info.len = this->msgSize; + this->conn_info.crc = 0; + + memcpy(this->pBuff, &this->conn_info, sizeof(this->conn_info)); + + /* increase the sequence no */ + this->conn_info.seq++; + + if (session_param.gp_interconnect_full_crc) + { + icpkthdr *pkt = (icpkthdr *)this->pBuff; + addCRC(pkt); + } +} + + /* + * sendtoWithRetry + * Retry sendto logic and send the packets. + */ +static ssize_t +sendtoWithRetry(int socket, const void *message, size_t length, + int flags, const struct sockaddr *dest_addr, + socklen_t dest_len, int retry, const char *errDetail) +{ + int32 n; + int count = 0; + +xmit_retry: + /* + * If given retry count is positive, retry up to the limited times. + * Otherwise, retry for unlimited times until succeed. + */ + if (retry > 0 && ++count > retry) + return n; + n = sendto(socket, message, length, flags, dest_addr, dest_len); + if (n < 0) + { + int save_errno = errno; + + if (errno == EINTR) + goto xmit_retry; + + /* + * EAGAIN: no space ? not an error. + * + * EFAULT: In Linux system call, it only happens when copying a socket + * address into kernel space failed, which is less likely to happen, + * but mocked heavily by our fault injection in regression tests. + */ + if (errno == EAGAIN || errno == EFAULT) + return n; + + /* + * If Linux iptables (nf_conntrack?) 
drops an outgoing packet, it may + * return an EPERM to the application. This might be simply because of + * traffic shaping or congestion, so ignore it. + */ + if (errno == EPERM) + { + LOG(LOG_ERROR, "Interconnect error writing an outgoing packet: %m, " + "error during sendto() %s", errDetail); + return n; + } + + /* + * If the OS can detect an MTU issue on the host network interfaces, we + * would get EMSGSIZE here. So, bail with a HINT about checking MTU. + */ + if (errno == EMSGSIZE) + { + std::stringstream ss; + ss << "ERROR, Interconnect error writing an outgoing packet: " << strerror(errno) << "error during sendto() call (error:" << save_errno << ", " << errDetail << ")." + << "check if interface MTU is equal across the cluster and lower than gp_max_packet_size" << "\n"; + throw ICNetworkException(ss.str(), __FILE__, __LINE__); + } + + std::stringstream ss; + ss <<"ERROR, Interconnect error writing an outgoing packet: "<seq, pkt->srcPid, pkt->dstPid); +#endif + return; + } +#endif + + Assert(pkt->srcContentId == global_param.segindex); + Assert(pkt->motNodeId == entry_->motNodeId); + LOG(DEBUG3, "UDPConn::sendOnce(): icid: %d, motNodeId: %d, srcSeg: %d, dstSeg: %d, srcPid: %d, dstPid: %d, seq: %d, len: %d, flags: %s", + pkt->icId, pkt->motNodeId, pkt->srcContentId, pkt->dstContentId, pkt->srcPid, pkt->dstPid, pkt->seq, pkt->len, flags2txt(pkt->flags)); + + char errDetail[256]; + snprintf(errDetail, sizeof(errDetail), "For Remote Connection: contentId=%d at %s", + this->remoteContentId, + this->remoteHostAndPort); + n = sendtoWithRetry(this->entry_->txfd, pkt, pkt->len, 0, + (struct sockaddr *) &this->peer, this->peer_len, -1, errDetail); + if (n != int(pkt->len)) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "Interconnect error writing an outgoing packet [seq %d]: short transmit (given %d sent %d) during sendto() call." + "For Remote Connection: contentId=%d at %s", pkt->seq, pkt->len, n, + this->remoteContentId, this->remoteHostAndPort); +#ifdef AMS_VERBOSE_LOGGING + logPkt("PKT DETAILS ", pkt); +#endif + } + return; +} + +void +UDPConn::handleStop() +{ + if (!this->stillActive || !this->stopRequested) + return; + + /* mark buffer empty */ + this->tupleCount = 0; + this->msgSize = sizeof(this->conn_info); + + /* now send our stop-ack EOS */ + this->conn_info.flags |= UDPIC_FLAGS_EOS; + + Assert(this->curBuff != NULL); + + this->pBuff[this->msgSize] = 'S'; + this->msgSize += 1; + + /* now ready to actually send */ + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "handleStopMsgs: node %d route %d, seq %d", + entry_->motNodeId, this->route, this->conn_info.seq); + + /* place it into the send queue */ + this->prepareXmit(); + this->sndQueue.append(this->curBuff); + this->curBuff = NULL; + this->pBuff = NULL; + + /* return all buffers */ + this->sndQueue.release(false); + this->unackQueue.release(session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY_IC ? false : true); + + this->tupleCount = 0; + this->msgSize = sizeof(this->conn_info); + + this->state = mcsEosSent; + this->stillActive = false; + this->stopRequested = false; +} + +/* + * sendBuffers + * Called by sender to send the buffers in the send queue. + * + * Send the buffers in the send queue of the connection if there is capacity left + * and the congestion control condition is satisfied. + * + * Here, we make sure that a connection can have at least one outstanding buffer. 
+ * This is very important for two reasons: + * + * 1) The handling logic of the ack of the outstanding buffer can always send a buffer + * in the send queue. Otherwise, there may be a deadlock. + * 2) This makes sure that any connection can have a minimum bandwidth for data + * sending. + * + * After sending a buffer, the buffer will be placed into both the unack queue and + * the corresponding queue in the unack queue ring. + */ +void +UDPConn::sendBuffers() +{ + while (this->capacity > 0 && this->sndQueue.length() > 0) + { + ICBuffer *buf = NULL; + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC || session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + if (this->unackQueue.length() > 0 && + unack_queue_ring.numSharedOutStanding >= (snd_control_info.cwnd - snd_control_info.minCwnd)) + break; + } + + /* for connection setup, we only allow one outstanding packet. */ + if (this->state == mcsSetupOutgoingConnection && this->unackQueue.length() >= 1) + break; + + buf = this->sndQueue.pop(); + + uint64 now = getCurrentTime(); + + buf->sentTime = now; + buf->unackQueueRingSlot = -1; + buf->nRetry = 0; + buf->conn = this; + this->capacity--; + + this->unackQueue.append(buf); + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC || session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + unack_queue_ring.numOutStanding++; + if (this->unackQueue.length() > 1) + unack_queue_ring.numSharedOutStanding++; + + putIntoUnackQueueRing(&unack_queue_ring, + buf, + this->computeExpirationPeriod(buf->nRetry), + now); + } + + /* + * Note the place of sendOnce here. If we send before appending it to + * the unack queue and putting it into unack queue ring, and there is + * a network error occurred in the sendOnce function, error message + * will be output. In the time of error message output, interrupts is + * potentially checked, if there is a pending query cancel, it will + * lead to a dangled buffer (memory leak). + */ +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.update(TPE_DATA_PKT_SEND, buf->pkt); +#endif + + struct icpkthdr *pkt_ = buf->pkt; + pkt_->send_time = now; + pkt_->recv_time = 0; + pkt_->retry_times = buf->nRetry; + this->sendOnce(buf->pkt); + + ic_statistics.sndPktNum++; + +#ifdef AMS_VERBOSE_LOGGING + logPkt("SEND PKT DETAIL", buf->pkt); +#endif + + this->sentSeq = buf->pkt->seq; + } +} + +/* + * handleDisorderPacket + * Called by rx thread to assemble and send a disorder message. + * + * In current implementation, we limit the number of lost packet sequence numbers + * in the disorder message by the MIN_PACKET_SIZE. There are two reasons here: + * + * 1) The maximal number of lost packet sequence numbers are actually bounded by the + * receive queue depth whose maximal value is very large. Since we share the packet + * receive and ack receive in the background thread, the size of disorder should be + * also limited by the max packet size. + * 2) We can use Gp_max_packet_size here to limit the number of lost packet sequence numbers. + * But considering we do not want to let senders send many packets when getting a lost + * message. Here we use MIN_PACKET_SIZE. 
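+ * The resulting cap is MAX_SEQS_IN_DISORDER_ACK, presumably the number of uint32 sequence numbers that fit in a MIN_PACKET_SIZE payload after the icpkthdr.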
+ * + * + * the format of a disorder message: + * I) pkt header + * - seq -> packet sequence number that triggers the disorder message + * - extraSeq -> the largest seq of the received packets + * - flags -> UDPIC_FLAGS_DISORDER + * - len -> sizeof(icpkthdr) + sizeof(uint32) * (lost pkt count) + * II) content + * - an array of lost pkt sequence numbers (uint32) + * + */ +void +UDPConn::handleDisorderPacket(int pos, uint32 tailSeq, icpkthdr *pkt) +{ + int start = 0; + uint32 lostPktCnt = 0; + uint32 *curSeq = (uint32 *) &rx_control_info.disorderBuffer[1]; + uint32 maxSeqs = MAX_SEQS_IN_DISORDER_ACK; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "PROCESS_DISORDER PKT BEGIN:"); +#endif + + start = this->pkt_q_tail; + + while (start != pos && lostPktCnt < maxSeqs) + { + if (this->pkt_q[start] == NULL) + { + *curSeq = tailSeq; + lostPktCnt++; + curSeq++; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "PROCESS_DISORDER add seq [%d], lostPktCnt %d", *curSeq, lostPktCnt); +#endif + } + + tailSeq++; + start = (start + 1) % this->pkt_q_capacity; + } + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "PROCESS_DISORDER PKT END:"); +#endif + + /* when reaching here, cnt must not be 0 */ + this->sendDisorderAck(pkt->seq, this->conn_info.seq - 1, lostPktCnt); +} + +/* + * handleAckForDisorderPkt + * Called by sender to deal with acks for disorder packet. + */ +bool +UDPConn::handleAckForDisorderPkt(icpkthdr *pkt) +{ + ICBufferLink *link = NULL; + ICBuffer *buf = NULL; + ICBufferLink *next = NULL; + uint64 now = getCurrentTime(); + uint32 *curLostPktSeq = 0; + int lostPktCnt = 0; + static uint32 times = 0; + static uint32 lastSeq = 0; + bool shouldSendBuffers = false; + + if (pkt->extraSeq != lastSeq) + { + lastSeq = pkt->extraSeq; + times = 0; + return false; + } + else + { + times++; + if (times != 2) + return false; + } + + curLostPktSeq = (uint32 *) &pkt[1]; + lostPktCnt = (pkt->len - sizeof(icpkthdr)) / sizeof(uint32); + + /* + * Resend all the missed packets and remove received packets from queues + */ + + link = this->unackQueue.first(); + buf = GET_ICBUFFER_FROM_PRIMARY(link); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "DISORDER: pktlen %d cnt %d pktseq %d first loss %d buf %p", + pkt->len, lostPktCnt, pkt->seq, *curLostPktSeq, buf); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + this->unackQueue.icBufferListLog(); + this->sndQueue.icBufferListLog(); + } +#endif + + /* + * iterate the unack queue + */ + while (!this->unackQueue.is_head(link) && buf->pkt->seq <= pkt->seq && lostPktCnt > 0) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "DISORDER: bufseq %d curlostpkt %d cnt %d buf %p pkt->seq %d", + buf->pkt->seq, *curLostPktSeq, lostPktCnt, buf, pkt->seq); +#endif + + if (buf->pkt->seq == pkt->seq) + { + this->handleAckedPacket(buf, now, pkt); + shouldSendBuffers = true; + break; + } + + if (buf->pkt->seq == *curLostPktSeq) + { + /* this is a lost packet, retransmit */ + + buf->nRetry++; + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC || session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + ICBufferList *alist = &unack_queue_ring.slots[buf->unackQueueRingSlot]; + buf = alist->remove(buf); + putIntoUnackQueueRing(&unack_queue_ring, buf, + this->computeExpirationPeriod(buf->nRetry), now); + } +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.update(TPE_DATA_PKT_SEND, buf->pkt); +#endif + + Assert(this == buf->conn); + this->sendOnce(buf->pkt); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "RESEND a buffer for DISORDER: seq 
%d", buf->pkt->seq); + logPkt("DISORDER RESEND DETAIL ", buf->pkt); +#endif + + ic_statistics.retransmits++; + curLostPktSeq++; + lostPktCnt--; + + link = link->next; + buf = GET_ICBUFFER_FROM_PRIMARY(link); + } + else if (buf->pkt->seq < *curLostPktSeq) + { + /* remove packet already received. */ + + next = link->next; + this->handleAckedPacket(buf, now, pkt); + shouldSendBuffers = true; + link = next; + buf = GET_ICBUFFER_FROM_PRIMARY(link); + } + else /* buf->pkt->seq > *curPktSeq */ + { + /* + * this case is introduced when the disorder message tell you a + * pkt is lost. But when we handle this message, a message (for + * example, duplicate ack, or another disorder message) arriving + * before this message already removed the pkt. + */ + curLostPktSeq++; + lostPktCnt--; + } + } + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC || session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + snd_control_info.ssthresh = Max(snd_control_info.cwnd / 2, snd_control_info.minCwnd); + snd_control_info.cwnd = snd_control_info.ssthresh; + } +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "After DISORDER: sndQ %d unackQ %d", this->sndQueue.length(), this->unackQueue.length()); + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + this->unackQueue.icBufferListLog(); + this->sndQueue.icBufferListLog(); + } +#endif + + return shouldSendBuffers; +} + +/* + * handleAckForDuplicatePkt + * Called by sender to deal with acks for duplicate packet. + * + */ +bool +UDPConn::handleAckForDuplicatePkt(icpkthdr *pkt) +{ + ICBufferLink *link = NULL; + ICBuffer *buf = NULL; + ICBufferLink *next = NULL; + uint64 now = getCurrentTime(); + bool shouldSendBuffers = false; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "RESEND the unacked buffers in the queue due to %s", pkt->len == 0 ? "PROCESS_START_RACE" : "DISORDER"); +#endif + + if (pkt->seq <= pkt->extraSeq) + { + /* Indicate a bug here. */ + LOG(LOG_ERROR, "invalid duplicate message: seq %d extraSeq %d", pkt->seq, pkt->extraSeq); + return false; + } + + link = this->unackQueue.first(); + buf = GET_ICBUFFER_FROM_PRIMARY(link); + + /* deal with continuous pkts */ + while (!this->unackQueue.is_head(link) && (buf->pkt->seq <= pkt->extraSeq)) + { + next = link->next; + this->handleAckedPacket(buf, now, pkt); + shouldSendBuffers = true; + link = next; + buf = GET_ICBUFFER_FROM_PRIMARY(link); + } + + /* deal with the single duplicate packet */ + while (!this->unackQueue.is_head(link) && buf->pkt->seq <= pkt->seq) + { + next = link->next; + if (buf->pkt->seq == pkt->seq) + { + this->handleAckedPacket(buf, now, pkt); + shouldSendBuffers = true; + break; + } + link = next; + buf = GET_ICBUFFER_FROM_PRIMARY(link); + } + + return shouldSendBuffers; +} + +/* + * checkNetworkTimeout + * check network timeout case. + */ +void +UDPConn::checkNetworkTimeout(ICBuffer *buf, uint64 now, bool *networkTimeoutIsLogged) +{ + /* + * Using only the time to first sent time to decide timeout is not enough, + * since there is a possibility the sender process is not scheduled or + * blocked by OS for a long time. In this case, only a few times are + * tried. Thus, the GUC Gp_interconnect_min_retries_before_timeout is + * added here. 
+ */
+ if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC &&
+ buf->nRetry % session_param.Gp_interconnect_debug_retry_interval == 0)
+ {
+ LOG(INFO, "resending packet (seq %d) to %s (pid %d cid %d) with %d retries in %lu seconds",
+ buf->pkt->seq, buf->conn->remoteHostAndPort, buf->pkt->dstPid, buf->pkt->dstContentId, buf->nRetry,
+ (now - buf->sentTime) / 1000 / 1000);
+ }
+
+ if ((buf->nRetry > session_param.Gp_interconnect_min_retries_before_timeout) &&
+ (now - buf->sentTime) > ((uint64) session_param.Gp_interconnect_transmit_timeout * 1000 * 1000))
+ {
+ std::stringstream ss;
+ ss << "ERROR, interconnect encountered a network error, please check your network. "
+ << "Failed to send packet (seq " << buf->pkt->seq << ") to " << buf->conn->remoteHostAndPort
+ << " (pid " << buf->pkt->dstPid << " cid " << buf->pkt->dstContentId << ") after " << buf->nRetry
+ << " retries in " << session_param.Gp_interconnect_transmit_timeout << " seconds.";
+ throw ICNetworkException(ss.str(), __FILE__, __LINE__);
+ }
+ else if ((buf->nRetry >= session_param.Gp_interconnect_min_retries_before_timeout) && !(*networkTimeoutIsLogged))
+ {
+ LOG(WARNING, "interconnect may have encountered a network error, please check your network. "
+ "Failed to send packet (seq %d) to %s (pid %d cid %d) after %d retries.",
+ buf->pkt->seq, buf->conn->remoteHostAndPort, buf->pkt->dstPid, buf->pkt->dstContentId, buf->nRetry);
+ *networkTimeoutIsLogged = true;
+ }
+}
+
+/*
+ * checkExpiration
+ * Check whether packets expire. If a packet expires, resend the packet,
+ * and adjust its position in the unack queue ring.
+ */
+void
+UDPConn::checkExpiration(ICChunkTransportState *transportStates, uint64 now)
+{
+ /* check for expiration */
+ int count = 0;
+ int retransmits = 0;
+ UDPConn *currBuffConn = NULL;
+
+ Assert(unack_queue_ring.currentTime != 0);
+
+ if (unlikely(session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER_IC))
+ {
+ checkRtmTimeout(&mudp, now, 500, transportStates, this->entry_, this);
+ return;
+ }
+
+ if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC)
+ {
+ uint64 timer_span_time = unack_queue_ring.currentTime + TIMER_SPAN_LOSS;
+
+ while (now >= (timer_span_time + unack_queue_ring.time_difference) && count++ < UNACK_QUEUE_RING_SLOTS_NUM)
+ {
+ /* expired, need to resend them */
+ ICBuffer *curBuf = NULL;
+
+ while ((curBuf = unack_queue_ring.slots[unack_queue_ring.idx].pop()) != NULL)
+ {
+ UDPConn *conn = static_cast<UDPConn *>(curBuf->conn);
+ curBuf->nRetry++;
+
+ /*
+ * Fixed Timeout Thresholds: Traditional TCP-style Retransmission Timeout
+ * (RTTVAR/RTO) calculations may be too rigid for networks with volatile
+ * latency. This leads to:
+ * Premature Retransmissions: Unnecessary data resends during temporary
+ * latency spikes, wasting bandwidth.
+ * Delayed Recovery: Slow reaction to actual packet loss when RTO is
+ * overly conservative.
+ * Lack of Context Awareness: Static RTO ignores real-time network behavior
+ * patterns, reducing throughput and responsiveness.
+ *
+ * Solution: Dynamic Timeout Threshold Adjustment
+ * Implements an adaptive timeout mechanism to optimize retransmission:
+ * if (now < (curBuf->sentTime + conn->rttvar.rto)) {
+ * uint32_t diff = (curBuf->sentTime + conn->rttvar.rto) - now;
+ * // ... (statistical tracking and threshold adjustment)
+ * }
+ * Temporary Latency Spike: Uses max (conservative) to avoid false
+ * retransmits, reducing bandwidth waste (vs. traditional mistaken
+ * retransmissions).
+ * Persistent Packet Loss: Prioritizes min (aggressive) via
+ * weight_retrans, accelerating recovery (vs. slow fixed-RTO reaction).
+ * Stable Network: Balances weights for equilibrium throughput (vs.
+ * static RTO limitations). + */ + if (now < (curBuf->sentTime + conn->rttvar.rto)) + { +#ifdef TIMEOUT_Z + uint32_t diff = (curBuf->sentTime + conn->rttvar.rto) - now; + if(unack_queue_ring.retrans_count == 0 && unack_queue_ring.no_retrans_count == 0) + { + unack_queue_ring.min = diff; + unack_queue_ring.max = diff; + } + + if (diff < unack_queue_ring.min) unack_queue_ring.min = diff; + if (diff > unack_queue_ring.max) unack_queue_ring.max = diff; + + if (unack_queue_ring.retrans_count == 0) + unack_queue_ring.time_difference = unack_queue_ring.max; + else if (unack_queue_ring.no_retrans_count == 0 && ic_statistics.retransmits < (session_param.Gp_interconnect_min_retries_before_timeout / 4)) + unack_queue_ring.time_difference = 0; + else + { + uint32_t total_count = unack_queue_ring.retrans_count + unack_queue_ring.no_retrans_count; + double weight_retrans = (double)unack_queue_ring.retrans_count / total_count; + double weight_no_retrans = (double)unack_queue_ring.no_retrans_count / total_count; + unack_queue_ring.time_difference = (uint32_t)(unack_queue_ring.max * weight_no_retrans + unack_queue_ring.min * weight_retrans); + } + + ++unack_queue_ring.no_retrans_count; + } + else + ++unack_queue_ring.retrans_count; +#endif + +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.update(TPE_DATA_PKT_SEND, curBuf->pkt); +#endif + + currBuffConn = static_cast(curBuf->conn); + putIntoUnackQueueRing(&unack_queue_ring, + curBuf, + currBuffConn->computeExpirationPeriod(curBuf->nRetry), getCurrentTime()); + struct icpkthdr *pkt_ = curBuf->pkt; + + pkt_->send_time = getCurrentTime(); + pkt_->recv_time = 0; + pkt_->retry_times = curBuf->nRetry; + + currBuffConn->sendOnce(curBuf->pkt); + + /* + * Adaptive Retry Backoff with Polling for Network Asymmetry Mitigation + * + * This logic addresses two critical network pathologies: + * 1. RTO Distortion Amplification: + * - Packet loss in volatile networks causes RTO-based retransmission errors + * - Multiple spurious retries increase network load and congestion collapse risk + * 2. Data Skew-Induced Starvation: + * - Under unbalanced workloads, low-traffic nodes experience MON (Message Order Number) delays + * - Delayed ACKs trigger false retransmissions even when packets arrive eventually + * - Unacked queue inflation worsens congestion in high-traffic nodes + */ + int32_t loop_ack = curBuf->nRetry; + uint32_t rto_min = UDP_RTO_MIN / 10; + uint32_t rtoMs = conn->rttvar.rto / 1000; + int32_t wait_time = rto_min > rtoMs ? rto_min : rtoMs; + int32_t loop = 0; + + /* + * To optimize performance, we need to process all the time-out file descriptors (fds) + * in each batch together. 
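+ *
+ * The retry loop below polls for an ack between resends and widens the wait
+ * interval on each pass; how aggressively depends on the retry count
+ * (loop_ack) relative to R = Gp_interconnect_min_retries_before_timeout.
+ * A simplified view of the backoff ladder used below:
+ *
+ *     loop_ack <  R/10         -> wait_time += wait_time / 10
+ *     R/10 < loop_ack < R/5    -> wait_time += RTO_MAX / 10
+ *     R/5  < loop_ack < R/2    -> wait_time += RTO_MAX / 5
+ *     otherwise (loop_ack < R) -> wait_time += RTO_MAX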
+ */ + if (loop_ack > 0) + { + while (loop++ < loop_ack) + { + if (this->entry_->pollAcks(wait_time)) + { + this->entry_->handleAcks(false); + curBuf->nRetry = 0; + break; + } + + struct icpkthdr *pkt_ = curBuf->pkt; + pkt_->send_time = getCurrentTime(); + pkt_->recv_time = 0; + pkt_->retry_times = curBuf->nRetry; + currBuffConn->sendOnce(pkt_); + + if (loop_ack < (session_param.Gp_interconnect_min_retries_before_timeout / 10)) + wait_time += wait_time / 10; + else if (loop_ack > (session_param.Gp_interconnect_min_retries_before_timeout / 10) && loop_ack < (session_param.Gp_interconnect_min_retries_before_timeout / 5)) + wait_time += RTO_MAX / 10; + else if (loop_ack > (session_param.Gp_interconnect_min_retries_before_timeout / 5) && loop_ack < (session_param.Gp_interconnect_min_retries_before_timeout / 2)) + wait_time += RTO_MAX / 5; + else if (loop_ack < (session_param.Gp_interconnect_min_retries_before_timeout)) + wait_time += RTO_MAX; + }; + } + + if (loop_ack > session_param.Gp_interconnect_min_retries_before_timeout / 5) + LOG(INFO, "Resending packet (seq %d) to %s (pid %d cid %d) with %d retries in %lu seconds", + curBuf->pkt->seq, curBuf->conn->remoteHostAndPort, + curBuf->pkt->dstPid, curBuf->pkt->dstContentId, curBuf->nRetry, + (now - curBuf->sentTime) / 1000 / 1000); + + retransmits++; + ic_statistics.retransmits++; + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, + currBuffConn->stat_count_resent); + + UDPConn::checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", + curBuf->pkt->seq, curBuf->nRetry, currBuffConn->rtt, currBuffConn->route); + logPkt("RESEND PKT in checkExpiration", curBuf->pkt); +#endif + } + + timer_span_time += TIMER_SPAN_LOSS; + unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); + } + } + else + { + while (now >= (unack_queue_ring.currentTime + TIMER_SPAN) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) + { + /* expired, need to resend them */ + ICBuffer *curBuf = NULL; + + while ((curBuf = unack_queue_ring.slots[unack_queue_ring.idx].pop()) != NULL) + { + curBuf->nRetry++; + currBuffConn = static_cast(curBuf->conn); + putIntoUnackQueueRing( + &unack_queue_ring, + curBuf, + currBuffConn->computeExpirationPeriod(curBuf->nRetry), now); + +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.update(TPE_DATA_PKT_SEND, curBuf->pkt); +#endif + + currBuffConn->sendOnce(curBuf->pkt); + + retransmits++; + ic_statistics.retransmits++; + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, currBuffConn->stat_count_resent); + UDPConn::checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", + curBuf->pkt->seq, curBuf->nRetry, curBuf->conn->rtt, curBuf->conn->route); + logPkt("RESEND PKT in checkExpiration", curBuf->pkt); +#endif + } + + unack_queue_ring.currentTime += TIMER_SPAN; + unack_queue_ring.idx = (unack_queue_ring.idx + 1) % (UNACK_QUEUE_RING_SLOTS_NUM); + } + + /* + * deal with case when there is a long time this function is not called. 
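+ *
+ * In that case currentTime is snapped to the latest TIMER_SPAN boundary not
+ * exceeding now, i.e.
+ *
+ *     currentTime = now - (now % TIMER_SPAN)
+ *
+ * so the loop above does not have to walk through every missed slot.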
+ */ + unack_queue_ring.currentTime = now - (now % (TIMER_SPAN)); + } + + if (retransmits > 0) + { + snd_control_info.ssthresh = Max(snd_control_info.cwnd / 2, snd_control_info.minCwnd); + snd_control_info.cwnd = snd_control_info.minCwnd; + } +} + +/* + * checkDeadlock + * Check whether deadlock occurs on a connection. + * + * What this function does is to send a status query message to rx thread when + * the connection has not received any acks for some time. This is to avoid + * potential deadlock when there are continuous ack losses. Packet resending + * logic does not help avoiding deadlock here since the packets in the unack + * queue may already been removed when the sender knows that they have been + * already buffered in the receiver side queue. + * + * Some considerations on deadlock check time period: + * + * Potential deadlock occurs rarely. According to our experiments on various + * workloads and hardware. It occurred only when fault injection is enabled + * and a large number packets and acknowledgments are discarded. Thus, here we + * use a relatively large deadlock check period. + * + */ +void +UDPConn::checkDeadlock() +{ + uint64 deadlockCheckTime; + + if (this->unackQueue.length() == 0 && this->capacity == 0 && this->sndQueue.length() > 0) + { + /* we must have received some acks before deadlock occurs. */ + Assert(this->deadlockCheckBeginTime > 0); + +#ifdef USE_ASSERT_CHECKING + if (udp_testmode) + { + deadlockCheckTime = 100000; + } + else +#endif + { + deadlockCheckTime = DEADLOCK_CHECKING_TIME; + } + + uint64 now = getCurrentTime(); + + /* request the capacity to avoid the deadlock case */ + if (((now - ic_control_info.lastDeadlockCheckTime) > deadlockCheckTime) && + ((now - this->deadlockCheckBeginTime) > deadlockCheckTime)) + { + this->sendStatusQueryMessage(this->conn_info.seq - 1); + ic_control_info.lastDeadlockCheckTime = now; + ic_statistics.statusQueryMsgNum++; + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC && this->entry_->pollAcks(50)) + { + this->entry_->handleAcks(false); + this->deadlockCheckBeginTime = now; + } + + /* check network error. */ + if ((now - this->deadlockCheckBeginTime) > ((uint64) session_param.Gp_interconnect_transmit_timeout * 100 * 1000)) + { + LOG(INFO, "Did not get any response from %s (pid %d cid %d) in 600 seconds.", this->remoteHostAndPort, + this->conn_info.dstPid, this->conn_info.dstContentId); + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER_IC) + this->capacity += 1; + + if ((now - this->deadlockCheckBeginTime) > ((uint64)session_param.Gp_interconnect_transmit_timeout * 1000 * 1000)) + { + std::stringstream ss; + ss << "ERROR, interconnect encountered a network error, please check your network." + << "Did not get any response from " << remoteHostAndPort << " (pid " << conn_info.dstPid << " cid " << conn_info.dstContentId << ") in " + << session_param.Gp_interconnect_transmit_timeout << " seconds."; + throw ICNetworkException(ss.str(), __FILE__, __LINE__); + } + } + } + } +} + +/* + * updateRetransmitStatistics + * Update the retransmit statistics. + */ +void +UDPConn::updateRetransmitStatistics() +{ + ic_statistics.retransmits++; + this->stat_count_resent++; + this->stat_max_resent = Max(this->stat_max_resent, this->stat_count_resent); +} + +/* + * checkExpirationCapacityFC + * Check expiration for capacity based flow control method. 
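+ *
+ * Only the head of the unack queue is retransmitted, and only once the
+ * connection has been idle longer than the timeout. A sketch of the check
+ * performed below (timeout is given in milliseconds):
+ *
+ *     if (now - ic_control_info.lastPacketSendTime >= (uint64) timeout * 1000)
+ *         resend(unackQueue.first());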
+ */ +void +UDPConn::checkExpirationCapacityFC(int timeout) +{ + if (this->unackQueue.length() == 0) + return; + + uint64 now = getCurrentTime(); + uint64 elapsed = now - ic_control_info.lastPacketSendTime; + + if (elapsed >= ((uint64) timeout * 1000)) + { + ICBufferLink *bufLink = this->unackQueue.first(); + ICBuffer *buf = GET_ICBUFFER_FROM_PRIMARY(bufLink); + + Assert(this == buf->conn); + this->sendOnce(buf->pkt); + + buf->nRetry++; + ic_control_info.lastPacketSendTime = now; + + this->updateRetransmitStatistics(); + UDPConn::checkNetworkTimeout(buf, now, &entry_->state->networkTimeoutIsLogged); + } +} + +/* + * checkExceptions + * Check exceptions including packet expiration, deadlock, bg thread error, NIC failure... + * Caller should start from 0 with retry, so that the expensive check for deadlock and + * QD connection can be avoided in a healthy state. + */ +void +UDPConn::checkExceptions(int retry, int timeout) +{ + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY_IC + /* || conn->state == mcsSetupOutgoingConnection */ ) + { + this->checkExpirationCapacityFC(timeout); + } + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC || session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + uint64 now = getCurrentTime(); + + if (now - ic_control_info.lastExpirationCheckTime > uint64(TIMER_CHECKING_PERIOD)) + { + this->checkExpiration(this->entry_->state, now); + ic_control_info.lastExpirationCheckTime = now; + } + } + + if ((retry & 0x3) == 2) + { + this->checkDeadlock(); + + checkRxThreadError(); + CHECK_INTERRUPTS(this->entry_->state); + } + + /* + * 1. NIC on master (and thus the QD connection) may become bad, check it. + * 2. Postmaster may become invalid, check it + * + * We check modulo 2 to correlate with the deadlock check above at the + * initial iteration. + */ + if ((retry & 0x3f) == 2) + { + checkQDConnectionAlive(); + CHECK_POSTMASTER_ALIVE(); + } +} + +/* + * computeTimeout + * Compute timeout value in ms. + */ +int +UDPConn::computeTimeout(int retry) +{ + int32_t rtoMs = 0; + + rtoMs = this->rttvar.rto / 1000; + if (this->unackQueue.length() == 0) + return TIMER_CHECKING_PERIOD; + + ICBufferLink *bufLink = this->unackQueue.first(); + ICBuffer *buf = GET_ICBUFFER_FROM_PRIMARY(bufLink); + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_ADVANCE_IC) + { + if (buf->nRetry == 0 && retry == 0 && unack_queue_ring.numSharedOutStanding < (snd_control_info.cwnd - snd_control_info.minCwnd)) + return 0; + + return rtoMs > TIMER_CHECKING_PERIOD ? rtoMs: TIMER_CHECKING_PERIOD; + } + + if (buf->nRetry == 0 && retry == 0) + return 0; + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_IC) + return TIMER_CHECKING_PERIOD; + + /* for capacity based flow control */ + return TIMEOUT(buf->nRetry); +} + +/* + * UDPConn::Send + * is used to send a tcItem to a single destination. Tuples often are + * *very small* we aggregate in our local buffer before sending into the kernel. + * + * PARAMETERS + * conn - UDPConn that the tcItem is to be sent to. + * tcItem - message to be sent. + * motionId - Node Motion Id. 
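+ *
+ * In this C++ port the chunk arrives as a DataBlock (data->pos / data->len)
+ * rather than a raw tcItem. The aggregation rule is unchanged: the chunk is
+ * copied into the current packet while
+ *
+ *     msgSize + data->len <= Gp_max_packet_size
+ *
+ * and only when the packet would overflow is it queued and flushed via
+ * sendBuffers().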
+ */ +void +UDPConn::Send(DataBlock *data) +{ + int length = data->len; + int retry = 0; + bool doCheckExpiration = false; + bool gotStops = false; + + Assert(this->msgSize > 0); + +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG3, "UDPConn::Send(): msgSize %d this chunk length %d this seq %d", + this->msgSize, data->len, this->conn_info.seq); +#endif + + if (this->msgSize + length <= global_param.Gp_max_packet_size) + { + memcpy(this->pBuff + this->msgSize, data->pos, data->len); + this->msgSize += length; + + this->tupleCount++; + return; + } + + /* prepare this for transmit */ + ic_statistics.totalCapacity += this->capacity; + ic_statistics.capacityCountingTime++; + + /* try to send it */ + this->prepareXmit(); + this->sndQueue.append(this->curBuff); + this->sendBuffers(); + + /* get a new buffer */ + this->curBuff = NULL; + this->pBuff = NULL; + + uint64 now = getCurrentTime(); + + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY_IC) + doCheckExpiration = false; + else + doCheckExpiration = (now - ic_control_info.lastExpirationCheckTime) > MAX_TIME_NO_TIMER_CHECKING ? true : false; + + ic_control_info.lastPacketSendTime = 0; + this->deadlockCheckBeginTime = now; + + while (doCheckExpiration || (this->curBuff = snd_buffer_pool.get(this)) == NULL) + { + int timeout = (doCheckExpiration ? 0 : this->computeTimeout(retry)); + + if (this->entry_->pollAcks(timeout)) + { + if (this->entry_->handleAcks(true)) + { + /* + * We make sure that we deal with the stop messages only after + * we get a buffer. Otherwise, if the stop message is not for + * this connection, this will lead to an error for the + * following data sending of this connection. + */ + gotStops = true; + } + } + this->checkExceptions(retry++, timeout); + doCheckExpiration = false; + + if (!doCheckExpiration && this->unackQueue.length() == 0 && this->capacity > 0 && this->sndQueue.length() > 0) + this->sendBuffers(); + } + + this->pBuff = (uint8 *) this->curBuff->pkt; + + if (gotStops) + { + /* handling stop message will make some connection not active anymore */ + this->entry_->handleStopMsgs(); + + if (!this->stillActive) + return; + } + + /* reinitialize connection */ + this->tupleCount = 0; + this->msgSize = sizeof(this->conn_info); + + /* now we can copy the input to the new buffer */ + memcpy(this->pBuff + this->msgSize, data->pos, data->len); + this->msgSize += length; + + this->tupleCount++; +} + +/* + * C++ implement for udp protocol. 
+ */ +UDPConn::UDPConn(TransportEntry *entry) +{ + /* the field of MotionConn */ + this->sockfd = -1; + this->pBuff = nullptr; + this->msgSize = 0; + this->msgPos = nullptr; + this->recvBytes = 0; + this->tupleCount = 0; + this->stillActive = false; + this->stopRequested = false; + this->cdbProc = nullptr; + this->remoteContentId = -1; + this->remoteHostAndPort[0] = '\0'; + this->opaque_data = nullptr; + this->sent_record_typmod = 0; + + /* the field of UDPConn */ + this->capacity = -1; + this->sentSeq = 0; + this->receivedAckSeq = 0; + this->consumedSeq = 0; + this->rtt = 0; + this->dev = 0; + this->deadlockCheckBeginTime = -1; + this->curBuff = nullptr; + this->route = 0; + this->peer_len = 0; + this->pkt_q_capacity = 0; + this->pkt_q_size = 0; + this->pkt_q_head = -1; + this->pkt_q_tail = -1; + this->pkt_q = nullptr; + this->stat_total_ack_time = 0; + this->stat_count_acks = 0; + this->stat_max_ack_time = 0; + this->stat_min_ack_time = 0; + this->stat_count_resent = 0; + this->stat_max_resent = 0; + this->stat_count_dropped = 0; + + this->state = mcsNull; + this->sockfd = -1; + this->msgSize = 0; + this->tupleCount = 0; + this->stillActive = false; + this->stopRequested = false; + this->cdbProc = NULL; + this->opaque_data = NULL; + this->sent_record_typmod = 0; + + /* + * "UDPConn dummyconn(NULL)" will be called by handleMismatch() in rx thread, + * it will lead to the error: "palloc called from thread". So code below should + * be called in MakeSendEntry() and MakeRecvEntry(); + * if (global_param.createOpaqueDataCallback) + * this->opaque_data = global_param.createOpaqueDataCallback(); + */ + + this->entry_ = entry; +} + +UDPConn* +TransportEntry::GetConn(int index) +{ + Assert(index >= 0); + + if (index >= 0 && static_cast(index) < this->conns_.size()) + return this->conns_[index].get(); + + std::stringstream ss; + ss << "invalid index for conn, index: " << index << ", conn size: " << conns_.size(); + throw ICInvalidIndex(ss.str(), __FILE__, __LINE__); +} + +/* + * aggregateStatistics + * aggregate statistics. + */ +void +TransportEntry::aggregateStatistics() +{ + /* + * We first clear the stats, and then compute new stats by aggregating the + * stats from each connection. + */ + this->stat_total_ack_time = 0; + this->stat_count_acks = 0; + this->stat_max_ack_time = 0; + this->stat_min_ack_time = ~((uint64) 0); + this->stat_count_resent = 0; + this->stat_max_resent = 0; + this->stat_count_dropped = 0; + + Assert(this->numConns == static_cast(this->conns_.size())); + for (int connNo = 0; connNo < this->numConns; connNo++) + { + UDPConn *conn = this->GetConn(connNo); + + this->stat_total_ack_time += conn->stat_total_ack_time; + this->stat_count_acks += conn->stat_count_acks; + this->stat_max_ack_time = Max(this->stat_max_ack_time, conn->stat_max_ack_time); + this->stat_min_ack_time = Min(this->stat_min_ack_time, conn->stat_min_ack_time); + this->stat_count_resent += conn->stat_count_resent; + this->stat_max_resent = Max(this->stat_max_resent, conn->stat_max_resent); + this->stat_count_dropped += conn->stat_count_dropped; + } +} + +/* + * handleAck + * handle acks incoming from our upstream peers. + * + * if we receive a stop message, return true (caller will clean up). + */ +bool +TransportEntry::handleAcks(bool need_flush) +{ + bool ret = false; + UDPConn *ackConn = NULL; + int n; + + struct sockaddr_storage peer; + socklen_t peerlen; + + struct icpkthdr *pkt = snd_control_info.ackBuffer; + bool shouldSendBuffers = false; + + for (;;) + { + + /* ready to read on our socket ? 
*/ + peerlen = sizeof(peer); + n = recvfrom(this->txfd, (char *) pkt, MIN_PACKET_SIZE, 0, + (struct sockaddr *) &peer, &peerlen); + + if (n < 0) + { + if (errno == EWOULDBLOCK) /* had nothing to read. */ + { + this->aggregateStatistics(); + return ret; + } + + CHECK_INTERRUPTS(this->state); + + if (errno == EINTR) + continue; + + throw ICNetworkException("ERROR, interconnect error waiting for peer ack, During recvfrom() call.", __FILE__, __LINE__); + } + else if (n < int(sizeof(struct icpkthdr))) + { + continue; + } + else if (n != int(pkt->len)) + { + continue; + } + + /* + * check the CRC of the payload. + */ + if (session_param.gp_interconnect_full_crc) + { + if (!checkCRC(pkt)) + { + ic_atomic_add_fetch_u32((ic_atomic_uint32 *) &ic_statistics.crcErrors, 1); + if (IC_DEBUG2 >= session_param.log_min_messages) + LOG(DEBUG2, "received network data error, dropping bad packet, user data unaffected."); + continue; + } + } + + /* + * read packet, is this the ack we want ? + */ + if (pkt->srcContentId == global_param.segindex && + pkt->srcPid == global_param.MyProcPid && + pkt->srcListenerPort == (UDP2_GetListenPortUDP()) && + pkt->sessionId == session_param.gp_session_id && + pkt->icId == this->state->icInstanceId) + { + Assert(pkt->motNodeId == motNodeId); + LOG(DEBUG3, "TransportEntry::handleAcks(): icid: %d, motNodeId: %d, srcSeg: %d, dstSeg: %d, srcPid: %d, dstPid: %d, seq: %d, extraSeq: %d, len: %d, flags: %s", + pkt->icId, pkt->motNodeId, pkt->srcContentId, pkt->dstContentId, pkt->srcPid, pkt->dstPid, pkt->seq, pkt->extraSeq, pkt->len, flags2txt(pkt->flags)); + + /* + * packet is for me. Note here we do not need to get a connection + * lock here, since background rx thread only read the hash table. + */ + ackConn = ic_control_info.connHtab.find(pkt); + if (ackConn == NULL) + { + LOG(INFO, "Received ack for unknown connection (flags 0x%x)", pkt->flags); + continue; + } + + ackConn->stat_count_acks++; + ic_statistics.recvAckNum++; + + uint64 now = getCurrentTime(); + + ackConn->deadlockCheckBeginTime = now; + + /* + * We simply disregard pkt losses (NAK) due to process start race + * (that is, sender is started earlier than receiver. rx + * background thread may receive packets when connections are not + * created yet). + * + * Another option is to resend the packet immediately, but + * experiments do not show any benefits. 
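+ *
+ * Once the NAK case is filtered out, the loop below dispatches on the ack
+ * flags roughly in this order (a summary of the code that follows):
+ *
+ *     UDPIC_FLAGS_CAPACITY  -> credit capacity for packets consumed up to
+ *                              pkt->extraSeq
+ *     UDPIC_FLAGS_DUPLICATE -> handleAckForDuplicatePkt()
+ *     UDPIC_FLAGS_DISORDER  -> handleAckForDisorderPkt()
+ *     UDPIC_FLAGS_FULL      -> receiver queue full, stop processing this ack
+ *     UDPIC_FLAGS_STOP      -> mark stopRequested, keep consuming the ack
+ *     UDPIC_FLAGS_ACK       -> pop acked buffers from the unack queue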
+ */ + if (pkt->flags & UDPIC_FLAGS_NAK) + continue; + + while (true) + { + if (pkt->flags & UDPIC_FLAGS_CAPACITY) + { + if (pkt->extraSeq > ackConn->consumedSeq) + { + ackConn->capacity += pkt->extraSeq - ackConn->consumedSeq; + ackConn->consumedSeq = pkt->extraSeq; + shouldSendBuffers = true; + } + } + else if (pkt->flags & UDPIC_FLAGS_DUPLICATE) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "GOTDUPACK [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", + pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); + + shouldSendBuffers |= (ackConn->handleAckForDuplicatePkt(pkt)); + break; + } + else if (pkt->flags & UDPIC_FLAGS_DISORDER) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "GOTDISORDER [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", + pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); + + shouldSendBuffers |= (ackConn->handleAckForDisorderPkt(pkt)); + break; + } + else if (pkt->flags & UDPIC_FLAGS_FULL) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "Recv buff is full [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); + break; + } + + /* + * don't get out of the loop if pkt->seq equals to + * ackConn->receivedAckSeq, need to check UDPIC_FLAGS_STOP + * flag + */ + if (pkt->seq < ackConn->receivedAckSeq) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "ack with bad seq?! expected (%d, %d] got %d flags 0x%x, capacity %d consumedSeq %d", + ackConn->receivedAckSeq, ackConn->sentSeq, pkt->seq, pkt->flags, ackConn->capacity, ackConn->consumedSeq); + break; + } + + /* haven't gotten a stop request, maybe this is one ? */ + if ((pkt->flags & UDPIC_FLAGS_STOP) && !ackConn->stopRequested && ackConn->stillActive) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "got ack with stop; srcpid %d dstpid %d cmd %d flags 0x%x pktseq %d connseq %d", + pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, pkt->seq, ackConn->conn_info.seq); +#endif + ackConn->stopRequested = true; + ackConn->conn_info.flags |= UDPIC_FLAGS_STOP; + ret = true; + /* continue to deal with acks */ + } + + if (pkt->seq == ackConn->receivedAckSeq) + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "ack with bad seq?! expected (%d, %d] got %d flags 0x%x, capacity %d consumedSeq %d", + ackConn->receivedAckSeq, ackConn->sentSeq, pkt->seq, pkt->flags, ackConn->capacity, ackConn->consumedSeq); + break; + } + + /* deal with a regular ack. 
*/ + if (pkt->flags & UDPIC_FLAGS_ACK) + { + ICBufferLink *link = NULL; + ICBufferLink *next = NULL; + ICBuffer *buf = NULL; + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "GOTACK [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", + pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); +#endif + + link = ackConn->unackQueue.first(); + buf = GET_ICBUFFER_FROM_PRIMARY(link); + + while (!ackConn->unackQueue.is_head(link) && buf->pkt->seq <= pkt->seq) + { + next = link->next; + ackConn->handleAckedPacket(buf, now, pkt); + shouldSendBuffers = true; + link = next; + buf = GET_ICBUFFER_FROM_PRIMARY(link); + } + } + break; + } + + /* + * When there is a capacity increase or some outstanding buffers + * removed from the unack queue ring, we should try to send + * buffers for the connection. Even when stop is received, we + * still send here, since in STOP/EOS race case, we may have been + * in EOS sending logic and will not check stop message. + */ + if (shouldSendBuffers && need_flush) + ackConn->sendBuffers(); + } + else + { + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "handleAck: not the ack we're looking for (flags 0x%x)...mot(%d) content(%d:%d) srcpid(%d:%d) " + "dstpid(%d) srcport(%d:%d) dstport(%d) sess(%d:%d) cmd(%d:%d)", + pkt->flags, pkt->motNodeId, pkt->srcContentId, global_param.segindex, + pkt->srcPid, global_param.MyProcPid, pkt->dstPid, pkt->srcListenerPort, + (UDP2_GetListenPortUDP()), pkt->dstListenerPort, pkt->sessionId, session_param.gp_session_id, + pkt->icId, this->state->icInstanceId); + } + } + + return ret; +} + +/* + * handleStopMsgs + * handle stop messages. + * + */ +void +TransportEntry::handleStopMsgs() +{ + int i = 0; + +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG3, "handleStopMsgs: node %d", this->motNodeId); +#endif + + while (i < this->numConns) + { + UDPConn *conn = this->GetConn(i); + +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG3, "handleStopMsgs: node %d route %d %s %s", this->motNodeId, conn->route, + (conn->stillActive ? "active" : "NOT active"), (conn->stopRequested ? "stop requested" : "")); + LOG(DEBUG3, "handleStopMsgs: node %d route %d msgSize %d", + this->motNodeId, conn->route, conn->msgSize); +#endif + + /* + * MPP-2427: we're guaranteed to have recently flushed, but this might + * not be empty (if we got a stop on a buffer that wasn't the one we + * were sending) ... empty it first so the outbound buffer is empty + * when we get here. + */ + conn->handleStop(); + + i++; + + if (i == this->numConns) + { + if (this->pollAcks(0)) + { + bool rs = this->handleAcks(true); + if (rs) + { + /* more stops found, loop again. 
*/ + i = 0; + continue; + } + } + } + } +} + +/* + * pollAcks + * Timeout polling of acks + */ +bool +TransportEntry::pollAcks(int timeout) +{ + struct pollfd nfd; + int n; + + nfd.fd = this->txfd; + nfd.events = POLLIN; + + n = poll(&nfd, 1, timeout); + if (n < 0) + { + CHECK_INTERRUPTS(this->state); + + if (errno == EINTR) + return false; + + throw ICNetworkException("ERROR, interconnect error waiting for peer ack During poll() call.", __FILE__, __LINE__); + /* not reached */ + } + + if (n == 0) /* timeout */ + { + return false; + } + + /* got an ack to handle (possibly a stop message) */ + if (n == 1 && (nfd.events & POLLIN)) + { + return true; + } + + return false; +} + +std::unique_ptr +TransportEntry::MakeRecvEntry(CChunkTransportStateImpl *state, + int icid, + ICExecSlice *sendSlice, + ICExecSlice *recvSlice) +{ + int incoming_count = 0; + int expectedTotalIncoming = 0; + + Assert(sendSlice->sliceIndex > 0); + Assert(recvSlice->sliceIndex >= 0); + + int motNodeID = sendSlice->sliceIndex; + int numConns = sendSlice->numPrimaryProcesses; + + std::unique_ptr pEntry = + std::make_unique(state, motNodeID, numConns, sendSlice, recvSlice); + + pEntry->conns_.resize(numConns); + for (int i = 0; i < numConns; ++i) + { + pEntry->conns_[i] = std::make_unique(pEntry.get()); + UDPConn *conn = pEntry->conns_[i].get(); + + Assert(i < sendSlice->numPrimaryProcesses); + ICCdbProcess *cdbProc = sendSlice->primaryProcesses + i; + if (cdbProc->valid) + { + conn->cdbProc = cdbProc; + + expectedTotalIncoming++; + + /* rx_buffer_queue */ + conn->pkt_q_capacity = session_param.Gp_interconnect_queue_depth; + conn->pkt_q_size = 0; + conn->pkt_q_head = 0; + conn->pkt_q_tail = 0; + + if (global_param.simpleFaultInjectorCallback) + global_param.simpleFaultInjectorCallback("interconnect_setup_palloc"); + + conn->pkt_q = (uint8 **) ic_malloc0(conn->pkt_q_capacity * sizeof(uint8 *)); + + /* update the max buffer count of our rx buffer pool. 
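+ * Each valid incoming connection contributes pkt_q_capacity
+ * (= Gp_interconnect_queue_depth) buffers to the pool budget; the same
+ * amount is subtracted again when the entry is torn down in
+ * DestroyRecvEntries().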
*/ + rx_buffer_pool.maxCount += conn->pkt_q_capacity; + + /* + * connection header info (defining characteristics of this + * connection) + */ + memset(&conn->conn_info, 0, sizeof(conn->conn_info)); + conn->route = i; + + conn->conn_info.seq = 1; + conn->stillActive = true; + + incoming_count++; + + conn->conn_info.motNodeId = pEntry->motNodeId; + conn->conn_info.recvSliceIndex = recvSlice->sliceIndex; + conn->conn_info.sendSliceIndex = sendSlice->sliceIndex; + + conn->conn_info.srcContentId = conn->cdbProc->contentid; + conn->conn_info.dstContentId = global_param.segindex; + + conn->conn_info.srcListenerPort = conn->cdbProc->listenerPort; + conn->conn_info.dstListenerPort = UDP2_GetListenPortUDP(); + conn->conn_info.srcPid = conn->cdbProc->pid; + conn->conn_info.dstPid = global_param.MyProcPid; + conn->conn_info.sessionId = session_param.gp_session_id; + conn->conn_info.icId = icid; + conn->conn_info.flags = UDPIC_FLAGS_RECEIVER_TO_SENDER; + + conn->rttvar.ts_rto = 0; + conn->rttvar.rto = UDP_INITIAL_RTO; + conn->rttvar.srtt = 0; + conn->rttvar.rttvar = 0; + conn->rttvar.snd_una = 0; + conn->rttvar.nrtx = 0; + conn->rttvar.max_nrtx = 0; + conn->rttvar.mss = UDP_DEFAULT_MSS; + conn->rttvar.cwnd = 2; + conn->rttvar.ssthresh = UDP_INFINITE_SSTHRESH; + conn->rttvar.loss_count = 0; + conn->rttvar.karn_mode = false; + conn->on_rto_idx = -1; + ic_control_info.connHtab.add(conn); + + if (global_param.createOpaqueDataCallback) + conn->opaque_data = global_param.createOpaqueDataCallback(); + } + } + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + LOG(DEBUG1, "SetupUDPInterconnect will activate " + "%d incoming, %d expect incoming for ic_instancce_id %d.", + incoming_count, expectedTotalIncoming, icid); + } + + return pEntry; +} + +std::unique_ptr +TransportEntry::MakeSendEntry(CChunkTransportStateImpl *state, + int icid, + ICExecSlice *sendSlice, + ICExecSlice *recvSlice) +{ + int outgoing_count = 0; + int expectedTotalOutgoing = 0; + + Assert(sendSlice->sliceIndex > 0); + Assert(recvSlice->sliceIndex >= 0); + + int motNodeID = sendSlice->sliceIndex; + int numConns = recvSlice->numPrimaryProcesses; + + std::unique_ptr pEntry = + std::make_unique(state, motNodeID, numConns, sendSlice, recvSlice); + + pEntry->txfd = ICSenderSocket; + pEntry->txport = ICSenderPort; + pEntry->txfd_family = ICSenderFamily; + + int route = 0; + pEntry->conns_.resize(numConns); + + for (int i = 0; i < numConns; ++i) + { + pEntry->conns_[i] = std::make_unique(pEntry.get()); + UDPConn *conn = pEntry->conns_[i].get(); + + /* + * Setup a MotionConn entry for each of our outbound connections. Request + * a connection to each receiving backend's listening port. NB: Some + * mirrors could be down & have no CdbProcess entry. + */ + ICCdbProcess *cdbProc = recvSlice->primaryProcesses + i; + if (cdbProc->valid) + { + conn->cdbProc = cdbProc; + conn->sndQueue.init(ICBufferListType_Primary); + conn->unackQueue.init(ICBufferListType_Primary); + conn->capacity = session_param.Gp_interconnect_queue_depth; + + /* send buffer pool must be initialized before this. 
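+ * Each outgoing connection adds Gp_interconnect_snd_queue_depth buffers to
+ * snd_buffer_pool.maxCount and one unit to snd_control_info.cwnd, so the
+ * send pool ends up sized as roughly numConns * snd_queue_depth; maxCount is
+ * later used as the initial ssthresh in CreateSendEntries().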
*/ + snd_buffer_pool.maxCount += session_param.Gp_interconnect_snd_queue_depth; + snd_control_info.cwnd += 1; + conn->curBuff = snd_buffer_pool.get(conn); + + /* should have at least one buffer for each connection */ + Assert(conn->curBuff != NULL); + + conn->rtt = DEFAULT_RTT; + conn->dev = DEFAULT_DEV; + conn->deadlockCheckBeginTime = 0; + conn->tupleCount = 0; + conn->msgSize = sizeof(conn->conn_info); + conn->sentSeq = 0; + conn->receivedAckSeq = 0; + conn->consumedSeq = 0; + conn->pBuff = (uint8 *) conn->curBuff->pkt; + conn->state = mcsSetupOutgoingConnection; + conn->route = route++; + expectedTotalOutgoing++; + + setupOutgoingUDPConnection(icid, pEntry.get(), conn); + outgoing_count++; + + if (global_param.createOpaqueDataCallback) + conn->opaque_data = global_param.createOpaqueDataCallback(); + } + } + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + LOG(DEBUG1, "SetupUDPInterconnect will activate " + "%d outgoing, %d expect outgoing routes for ic_instancce_id %d.", + outgoing_count, expectedTotalOutgoing, icid); + } + + return pEntry; +} + +TransportEntry::TransportEntry(CChunkTransportStateImpl *state, + int motNodeID, + int numConns, + ICExecSlice *sendSlice, + ICExecSlice *recvSlice) +{ + /* the field of CChunkTransportStateEntry */ + this->valid = false; + this->conns = nullptr; + + /* the field of TransportEntry */ + this->txfd = -1; + this->txfd_family = -1; + this->txport = 0; + this->sendingEos = false; + this->stat_total_ack_time = 0; + this->stat_count_acks = 0; + this->stat_max_ack_time = 0; + this->stat_min_ack_time = 0; + this->stat_count_resent = 0; + this->stat_max_resent = 0; + this->stat_count_dropped = 0; + + this->motNodeId = motNodeID; + this->numConns = numConns; + this->scanStart = 0; + this->sendSlice = sendSlice; + this->recvSlice = recvSlice; + this->state = state; + this->valid = true; +} + +/* + * receiveChunksUDPIFC + * Receive chunks from the senders + * + * MUST BE CALLED WITH ic_control_info.lock LOCKED. + */ +void +TransportEntry::receiveChunksUDPIFC(int16 *srcRoute, + UDPConn *conn, + GetDataLenInPacket getLen, + DataBlock *data) +{ + int retries = 0; + bool directed = false; + +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG5, "receivechunksUDP: motnodeid %d", this->motNodeId); +#endif + + if (conn != nullptr) + { + directed = true; + *srcRoute = conn->route; + rx_control_info.mainWaitingState.set(this->motNodeId, conn->route, this->state->icInstanceId); + } + else + { + /* non-directed receive */ + rx_control_info.mainWaitingState.set(this->motNodeId, ANY_ROUTE, this->state->icInstanceId); + } + + std::unique_lock lock(mtx); + auto timeout = std::chrono::milliseconds(MAIN_THREAD_COND_TIMEOUT_MS); + + /* we didn't have any data, so we've got to read it from the network. */ + for (;;) + { + UDPConn *rxconn = nullptr; + + /* 1. Do we have data ready */ + if (rx_control_info.mainWaitingState.reachRoute != ANY_ROUTE) + { + rxconn = this->GetConn(rx_control_info.mainWaitingState.reachRoute); + rxconn->prepareRxConnForRead(); + + LOG(DEBUG2, "receiveChunksUDPIFC: non-directed rx woke on route %d", rx_control_info.mainWaitingState.reachRoute); + rx_control_info.mainWaitingState.reset(); + } + + this->aggregateStatistics(); + if (rxconn) + { + Assert(rxconn->pBuff); + + pthread_mutex_unlock(&ic_control_info.lock); + + LOG(DEBUG2, "got data with length %d", rxconn->recvBytes); + /* successfully read into this connection's buffer. 
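+ * Note that ic_control_info.lock (held on entry per the function header) has
+ * already been released above, so GetDataInBuf() below runs without holding
+ * the interconnect lock.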
*/ + rxconn->GetDataInBuf(getLen, data); + + if (!directed) + *srcRoute = rxconn->route; + + return; + } + + retries++; + + /* + * Ok, we've processed all the items currently in the queue. Arm the + * latch (before releasing the mutex), and wait for more messages to + * arrive. The RX thread will wake us up using the latch. + */ + pthread_mutex_unlock(&ic_control_info.lock); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + LOG(DEBUG5, "waiting (timed) on route %d %s", rx_control_info.mainWaitingState.waitingRoute, + (rx_control_info.mainWaitingState.waitingRoute == ANY_ROUTE ? "(any route)" : "")); + } + + /* + * Wait for data to become ready. + * + * In the QD, also wake up immediately if any QE reports an + * error through the main QD-QE libpq connection. For that, ask + * the dispatcher for a file descriptor to wait on for that. + */ + cv.wait_for(lock, timeout, []{return rx_control_info.mainWaitingState.reachRoute != ANY_ROUTE;}); + + /* check the potential errors in rx thread. */ + checkRxThreadError(); + + /* do not check interrupts when holding the lock */ + CHECK_INTERRUPTS(this->state); + + /* check to see if the task coordinator should cancel */ + CHECK_CANCEL(this->state); + + /* + * 1. NIC on master (and thus the QD connection) may become bad, check + * it. 2. Postmaster may become invalid, check it + */ + if ((retries & 0x3f) == 0) + { + checkQDConnectionAlive(); + CHECK_POSTMASTER_ALIVE(); + } + + pthread_mutex_lock(&ic_control_info.lock); + + } /* for (;;) */ + + /* We either got data, or get cancelled. We never make it out to here. */ + return; /* make GCC behave */ +} + +void +UDPConn::GetDataInBuf(GetDataLenInPacket getLen, DataBlock *data) +{ + int bytesProcessed = 0; + + Assert(data); + +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG5, "recvtuple chunk recv bytes %d msgsize %d conn->pBuff %p conn->msgPos: %p", + this->recvBytes, this->msgSize, this->pBuff, this->msgPos); +#endif + + int ic_hdr_size = sizeof(struct icpkthdr); + + data->pos = this->msgPos + ic_hdr_size; + int rc = getLen ? 
getLen(data->pos, this->msgSize - ic_hdr_size) : (this->msgSize - ic_hdr_size); + if (rc < 0) + { + std::stringstream ss; + ss << "Failed to call getLen in GetDataInBuf, result: " << rc; + throw ICException(ss.str(), __FILE__, __LINE__); + } + data->len = rc; + + bytesProcessed += ic_hdr_size; + bytesProcessed += data->len; + + Assert(bytesProcessed == this->msgSize); + + this->recvBytes -= this->msgSize; + if (this->recvBytes != 0) + { +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG5, "residual message %d bytes", this->recvBytes); +#endif + this->msgPos += this->msgSize; + } + + this->msgSize = 0; +} + +/* + * RecvTupleChunkFromAnyUDPIFC_Internal + * Receive tuple chunks from any route (connections) + */ +void +TransportEntry::RecvAny(int16 *srcRoute, + GetDataLenInPacket getLen, + DataBlock *data) +{ + int index, + activeCount = 0; + bool found = false; + + UDPConn *conn; + + index = this->scanStart; + + pthread_mutex_lock(&ic_control_info.lock); + + for (int i = 0; i < this->numConns; i++, index++) + { + if (index >= this->numConns) + index = 0; + + conn = this->GetConn(index); + if (conn->stillActive) + activeCount++; + + ic_statistics.totalRecvQueueSize += conn->pkt_q_size; + ic_statistics.recvQueueSizeCountingTime++; + + if (conn->pkt_q_size > 0) + { + found = true; + conn->prepareRxConnForRead(); + break; + } + } + + if (found) + { + pthread_mutex_unlock(&ic_control_info.lock); + + conn->GetDataInBuf(getLen, data); + + *srcRoute = conn->route; + this->scanStart = index + 1; + return; + } + + /* no data pending in our queue */ + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "RecvTupleChunkFromAnyUDPIFC(): activeCount is %d", activeCount); +#endif + if (activeCount == 0) + { + pthread_mutex_unlock(&ic_control_info.lock); + return; + } + + /* receiveChunksUDPIFC() releases ic_control_info.lock as a side-effect */ + this->receiveChunksUDPIFC(srcRoute, nullptr, getLen, data); + + this->scanStart = *srcRoute + 1; +} + +/* + * RecvTupleChunkFromUDPIFC_Internal + * Receive tuple chunks from a specific route (connection) + */ +void +TransportEntry::RecvRoute(int16 srcRoute, GetDataLenInPacket getLen, DataBlock *data) +{ + UDPConn *conn = this->GetConn(srcRoute); + +#ifdef AMS_VERBOSE_LOGGING + if (!conn->stillActive) + { + LOG(INFO, "RecvTupleChunkFromUDPIFC(): connection inactive ?!"); + } +#endif + + pthread_mutex_lock(&ic_control_info.lock); + + if (!conn->stillActive) + { + pthread_mutex_unlock(&ic_control_info.lock); + return; + } + + ic_statistics.totalRecvQueueSize += conn->pkt_q_size; + ic_statistics.recvQueueSizeCountingTime++; + + if (conn->pkt_q[conn->pkt_q_head] != NULL) + { + conn->prepareRxConnForRead(); + + pthread_mutex_unlock(&ic_control_info.lock); + + conn->GetDataInBuf(getLen, data); + + return; + } + + /* no existing data, we've got to read a packet */ + /* receiveChunksUDPIFC() releases ic_control_info.lock as a side-effect */ + int16 route; + this->receiveChunksUDPIFC(&route, conn, getLen, data); +} + +/* + * TeardownUDPIFCInterconnect_Internal + * Helper function for TeardownUDPIFCInterconnect. + * + * Developers should pay attention to: + * + * 1) Do not handle interrupts/throw errors in Teardown, otherwise, Teardown may be called twice. + * It will introduce an undefined behavior. And memory leaks will be introduced. + * + * 2) Be careful about adding elog/ereport/write_log in Teardown function, + * esp, out of HOLD_INTERRUPTS/RESUME_INTERRUPTS pair, since elog/ereport/write_log may + * handle interrupts. 
+ * + */ +void +CChunkTransportStateImpl::teardown(bool hasErrors) +{ + bool isReceiver = false; + + /* Log the start of TeardownInterconnect. */ + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_TERSE_IC) + { + LogSeverity elevel = INFO; + + if (hasErrors || !this->activated) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + elevel = INFO; + else + elevel = DEBUG1; + } + else if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + elevel = DEBUG4; + + if (elevel) + { + ICExecSlice *mySlice = &this->sliceTable->slices[this->sliceId]; + LOG(elevel, "Interconnect seg%d slice%d cleanup state: %s; setup was %s", + global_param.segindex, mySlice->sliceIndex, + hasErrors ? "hasErrors" : "normal", + this->activated ? "completed" : "exited"); + } + + /* if setup did not complete, log the slicetable */ + if (!this->activated && session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + //elog_node_display(DEBUG3, "local slice table", this->sliceTable, true); + //TODO: print real slicetable; + LOG(DEBUG3, "local slice table: ####"); + } + } + + /* + * The long jump with CheckXXX() in receiveChunksUDPIFC() introduces the + * locked mtx, so unlock it here. + */ + mtx.unlock(); + + /* + * add lock to protect the hash table, since background thread is still + * working. + */ + pthread_mutex_lock(&ic_control_info.lock); + + if (session_param.gp_interconnect_cache_future_packets) + cleanupStartupCache(); + + /* + * Now "normal" connections which made it through our peer-registration + * step. With these we have to worry about "in-flight" data. + */ + this->DestroySendEntries(); + + /* + * Previously, there is a piece of code that deals with pending stops. Now + * it is delegated to background rx thread which will deal with any + * mismatched packets. + */ + + /* + * cleanup all of our Receiving Motion nodes, these get closed immediately + * (the receiver know for real if they want to shut down -- they aren't + * going to be processing any more data). + */ + this->DestroyRecvEntries(&isReceiver); + + /* + * now that we've moved active rx-buffers to the freelist, we can prune + * the freelist itself + */ + while (rx_buffer_pool.count > rx_buffer_pool.maxCount) + { + icpkthdr *buf = NULL; + + /* If this happened, there are some memory leaks.. */ + if (rx_buffer_pool.freeList == NULL) + { + pthread_mutex_unlock(&ic_control_info.lock); + + std::stringstream ss; + ss << "FATAL: freelist NULL: count " << rx_buffer_pool.count + << " max " << rx_buffer_pool.maxCount << " buf " << rx_buffer_pool.freeList; + throw ICFatalException(ss.str(), __FILE__, __LINE__); + } + + buf = rx_buffer_pool.get_free(); + rx_buffer_pool.release(buf); + } + + /* + * Update the history of interconnect instance id. 
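+ * On the dispatcher (QD) the id is recorded in the cursor history table,
+ * while an executor (QE) only remembers the last torn-down id; presumably
+ * this is what lets the rx thread classify packets that belong to an
+ * already-torn-down instance (an assumption, the consumers live elsewhere).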
+ */ + if (global_param.Gp_role == GP_ROLE_DISPATCH_IC) + { + rx_control_info.cursorHistoryTable.update(this->icInstanceId, 0); + } + else if (global_param.Gp_role == GP_ROLE_EXECUTE_IC) + { + rx_control_info.lastTornIcId = this->icInstanceId; + } + + if (IC_DEBUG1 >= session_param.log_min_messages) + { + LOG(DEBUG1, "Interconnect State: " + "isSender %d isReceiver %d " + "snd_queue_depth %d recv_queue_depth %d Gp_max_packet_size %d " + "UNACK_QUEUE_RING_SLOTS_NUM %d TIMER_SPAN %lld DEFAULT_RTT %d " + "hasErrors %d, ic_instance_id %d ic_id_last_teardown %d " + "snd_buffer_pool.count %d snd_buffer_pool.maxCount %d snd_sock_bufsize %d recv_sock_bufsize %d " + "snd_pkt_count %d retransmits %d crc_errors %d" + " recv_pkt_count %d recv_ack_num %d" + " recv_queue_size_avg %f" + " capacity_avg %f" + " freebuf_avg %f " + "mismatch_pkt_num %d disordered_pkt_num %d duplicated_pkt_num %d" + " cwnd %f status_query_msg_num %d", + ic_control_info.isSender, isReceiver, + session_param.Gp_interconnect_snd_queue_depth, session_param.Gp_interconnect_queue_depth, global_param.Gp_max_packet_size, + UNACK_QUEUE_RING_SLOTS_NUM, TIMER_SPAN, DEFAULT_RTT, + hasErrors, this->icInstanceId, rx_control_info.lastTornIcId, + snd_buffer_pool.count, snd_buffer_pool.maxCount, ic_control_info.socketSendBufferSize, ic_control_info.socketRecvBufferSize, + ic_statistics.sndPktNum, ic_statistics.retransmits, ic_statistics.crcErrors, + ic_statistics.recvPktNum, ic_statistics.recvAckNum, + (double) ((double) ic_statistics.totalRecvQueueSize) / ((double) ic_statistics.recvQueueSizeCountingTime), + (double) ((double) ic_statistics.totalCapacity) / ((double) ic_statistics.capacityCountingTime), + (double) ((double) ic_statistics.totalBuffers) / ((double) ic_statistics.bufferCountingTime), + ic_statistics.mismatchNum, ic_statistics.disorderedPktNum, ic_statistics.duplicatedPktNum, + snd_control_info.cwnd, ic_statistics.statusQueryMsgNum); + } + + ic_control_info.isSender = false; + memset(&ic_statistics, 0, sizeof(ICStatistics)); + + pthread_mutex_unlock(&ic_control_info.lock); + + /* reset the rx thread network error flag */ + resetRxThreadError(); + + /* free sliceTable */ + if (this->sliceTable) + { + ICSliceTable *ic_tbl = this->sliceTable; + for (int i = 0; i < ic_tbl->numSlices; ++i) + { + ICExecSlice *ic_slice = ic_tbl->slices + i; + ic_free(ic_slice->children); + ic_free(ic_slice->primaryProcesses); + } + ic_free(ic_tbl->slices); + ic_free(ic_tbl); + } + + this->activated = false; + this->sliceTable = NULL; + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_TERSE_IC) + LOG(DEBUG1, "TeardownUDPIFCInterconnect_Internal successful"); +} + +void +TransportEntry::Broadcast(DataBlock *data, int *inactiveCountPtr) +{ + int *p_inactive = inactiveCountPtr; + int index, inactive = 0; + + /* add our tcItem to each of the outgoing buffers. */ + index = Max(0, global_param.segindex); /* entry-db has -1 */ + for (int i = 0; i < this->numConns; i++, index++) + { + if (index >= this->numConns) + index = 0; + + UDPConn *conn = this->GetConn(index); + + /* only send to still interested receivers. */ + if (conn->stillActive) + { + conn->Send(data); + if (!conn->stillActive) + inactive++; + } + } + + if (p_inactive != NULL) + *p_inactive = (inactive ? 
1 : 0); +} + +TransportEntry* +CChunkTransportStateImpl::GetEntry(int motNodeID, bool checkValid) +{ + if (motNodeID > 0 && motNodeID <= static_cast(this->entries_.size())) + { + TransportEntry *pEntry = this->entries_[motNodeID - 1].get(); + if (pEntry != nullptr) + { + if (!checkValid) + return pEntry; + if (pEntry->motNodeId == motNodeID && pEntry->valid) + return pEntry; + } + } + + std::stringstream ss; + ss << "ERROR, Interconnect Error: Unexpected Motion Node Id: " << motNodeID + << ". This means a motion node that wasn't setup is requesting interconnect resources."; + throw ICInvalidIndex(ss.str(), __FILE__, __LINE__); +} + +/* + * The number of the Receiving Motion may be > 1, such as + * Hashjoin + * -> Redis Motion + * ... + * -> Hash + * -> Redis Motion + * ... + */ +void +CChunkTransportStateImpl::CreateRecvEntries(ICSliceTable *sliceTable) +{ + ICExecSlice *mySlice = &sliceTable->slices[sliceTable->localSlice]; + + /* now we'll do some setup for each of our Receiving Motion Nodes. */ + for (int child_index = 0; child_index < mySlice->numChildren; ++child_index) + { + int childId = mySlice->children[child_index]; + ICExecSlice *sendSlice = &sliceTable->slices[childId]; + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "Setup recving connections: my slice %d, childId %d", mySlice->sliceIndex, childId); + + if (sendSlice->sliceIndex > static_cast(this->entries_.size())) + this->entries_.resize(sendSlice->sliceIndex); + this->checkMotNodeID(sendSlice->sliceIndex); + + std::unique_ptr pEntry = + TransportEntry::MakeRecvEntry(this, sliceTable->ic_instance_id, sendSlice, mySlice); + this->entries_[sendSlice->sliceIndex - 1] = std::move(pEntry); + } +} + +void +CChunkTransportStateImpl::CreateSendEntries(ICSliceTable *sliceTable) +{ + ICExecSlice *sendSlice = &sliceTable->slices[sliceTable->localSlice]; + + if (sendSlice->parentIndex == -1) { + ic_control_info.isSender = false; + ic_control_info.lastExpirationCheckTime = 0; + return; + } + + snd_control_info.cwnd = 0; + snd_control_info.minCwnd = 0; + snd_control_info.ssthresh = 0; + + snd_buffer_pool.init(); + initUnackQueueRing(&unack_queue_ring); + if (session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_LOSS_TIMER_IC) + initUdpManager(&mudp); + ic_control_info.isSender = true; + ic_control_info.lastExpirationCheckTime = getCurrentTime(); + ic_control_info.lastPacketSendTime = ic_control_info.lastExpirationCheckTime; + ic_control_info.lastDeadlockCheckTime = ic_control_info.lastExpirationCheckTime; + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "Interconnect seg%d slice%d setting up sending motion node", + global_param.segindex, sendSlice->sliceIndex); + + if (sendSlice->sliceIndex > static_cast(this->entries_.size())) + this->entries_.resize(sendSlice->sliceIndex); + this->checkMotNodeID(sendSlice->sliceIndex); + + ICExecSlice *recvSlice = &sliceTable->slices[sendSlice->parentIndex]; + std::unique_ptr pEntry = + TransportEntry::MakeSendEntry(this, sliceTable->ic_instance_id, sendSlice, recvSlice); + + pEntry->txfd = ICSenderSocket; + pEntry->txport = ICSenderPort; + pEntry->txfd_family = ICSenderFamily; + + snd_control_info.minCwnd = snd_control_info.cwnd; + snd_control_info.ssthresh = snd_buffer_pool.maxCount; + +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.init(); +#endif + + this->entries_[sendSlice->sliceIndex - 1] = std::move(pEntry); +} + +void +CChunkTransportStateImpl::DestroyRecvEntries(bool *isReceiver) +{ + ICExecSlice 
*mySlice = &this->sliceTable->slices[this->sliceId]; + for (int child_index = 0; child_index < mySlice->numChildren; ++child_index) + { + int childId = mySlice->children[child_index]; + ICExecSlice *aSlice = &this->sliceTable->slices[childId]; + + /* + * First check whether the entry is initialized to avoid the potential + * errors thrown out from the removeChunkTransportState, which may + * introduce some memory leaks. + */ + int motNodeID = aSlice->sliceIndex; + if (this->entries_[motNodeID - 1] == nullptr) + continue; + + TransportEntry *pEntry = this->entries_[motNodeID - 1].get(); + Assert(motNodeID == pEntry->motNodeId); + + /* now it is safe to remove. */ + if (!pEntry->valid) + continue; + +#ifdef USE_ASSERT_CHECKING + pEntry->dumpConnections("/tmp/receiving_entries"); +#endif + /* remove it */ + pEntry->valid = false; + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "Interconnect closing connections from slice%d", aSlice->sliceIndex); + + *isReceiver = true; + + /* + * receivers know that they no longer care about data from + * below ... so we can safely discard data queued in both + * directions + */ + for (size_t i = 0; i < pEntry->conns_.size(); ++i) + { + UDPConn *conn = pEntry->conns_[i].get(); + + Assert(conn); + if (!conn) + continue; + + if (conn->cdbProc == NULL) + continue; + + /* out of memory has occurred, break out */ + if (!conn->pkt_q) + break; + + rx_buffer_pool.maxCount -= conn->pkt_q_capacity; + + ic_control_info.connHtab.remove(conn); + + /* + * ReleaseBuffer() dequeues messages and moves + * them to pBuff + */ + while (conn->pkt_q_size > 0) + conn->ReleaseBuffer(NULL); + + /* we also need to clear all the out-of-order packets */ + conn->freeDisorderedPackets(); + + /* free up the packet queue */ + ic_free(conn->pkt_q); + conn->pkt_q = NULL; + + if (global_param.destroyOpaqueDataCallback) + global_param.destroyOpaqueDataCallback(&conn->opaque_data); + + if (conn->curBuff) + { + ic_free(conn->curBuff); + conn->curBuff = NULL; + } + } // for conn + + Assert(!pEntry->conns); + } // for entry +} + +/* + * computeNetworkStatistics + * Compute the max/min/avg network statistics. + */ +static inline void +computeNetworkStatistics(uint64 value, uint64 *min, uint64 *max, double *sum) +{ + if (value >= *max) + *max = value; + if (value <= *min) + *min = value; + *sum += value; +} + +void +CChunkTransportStateImpl::DestroySendEntries() +{ + ICExecSlice *mySlice = &this->sliceTable->slices[this->sliceId]; + if (mySlice->parentIndex == -1) + return; + + /* cleanup a Sending motion node. */ + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + ICExecSlice *parentSlice = &this->sliceTable->slices[mySlice->parentIndex]; + LOG(DEBUG1, "Interconnect seg%d slice%d closing connections to slice%d (%d peers)", + global_param.segindex, mySlice->sliceIndex, mySlice->parentIndex, + parentSlice->numPrimaryProcesses); + } + + /* + * In the olden days, we required that the error case successfully + * transmit and end-of-stream message here. But the introduction of + * cdbdisp_check_estate_for_cancel() alleviates for the QD case, and + * the cross-connection of writer gangs in the dispatcher (propagation + * of cancel between them) fixes the I-S case. + * + * So the call to forceEosToPeers() is no longer required. 
+ */ + int motNodeID = mySlice->sliceIndex; + if (this->entries_[motNodeID - 1] == nullptr) + return; + + TransportEntry *pEntry = this->entries_[motNodeID - 1].get(); + Assert(motNodeID == pEntry->motNodeId); + + /* now it is safe to remove. */ + if (!pEntry->valid) + return; + +#ifdef USE_ASSERT_CHECKING + pEntry->dumpConnections("/tmp/sending_entries"); +#endif + + /* remove it */ + pEntry->valid = false; + + uint64 maxRtt = 0; + double avgRtt = 0; + uint64 minRtt = ~((uint64) 0); + + uint64 maxDev = 0; + double avgDev = 0; + uint64 minDev = ~((uint64) 0); + + /* connection array allocation may fail in interconnect setup. */ + for (size_t i = 0; i < pEntry->conns_.size(); ++i) + { + UDPConn *conn = pEntry->conns_[i].get(); + + Assert(conn); + if (!conn) + continue; + + if (!conn->cdbProc) + continue; + + /* compute some statistics */ + computeNetworkStatistics(conn->rtt, &minRtt, &maxRtt, &avgRtt); + computeNetworkStatistics(conn->dev, &minDev, &maxDev, &avgDev); + + conn->sndQueue.release(false); + conn->unackQueue.release(session_param.Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY_IC ? false : true); + + ic_control_info.connHtab.remove(conn); + + if (conn->curBuff) + { + ic_free(conn->curBuff); + conn->curBuff = NULL; + } + } + avgRtt = avgRtt / pEntry->numConns; + avgDev = avgDev / pEntry->numConns; + + /* free all send side buffers */ + snd_buffer_pool.clean(); + + Assert(!pEntry->conns); + +#ifdef TRANSFER_PROTOCOL_STATS + trans_proto_stats.dump(); +#endif + + if (IC_DEBUG1 >= session_param.log_min_messages) + { + LOG(DEBUG1, "Interconnect State: isSender %d DEFAULT_RTT %d rtt/dev [%lu/%lu, %f/%f, %lu/%lu] ", + ic_control_info.isSender, DEFAULT_RTT, (minRtt == ~((uint64) 0) ? 0 : minRtt), + (minDev == ~((uint64) 0) ? 0 : minDev), avgRtt, avgDev, maxRtt, maxDev); + } +} + +void +CChunkTransportStateImpl::checkMotNodeID(int sendMotNodeID) +{ + Assert(sendMotNodeID > 0); + + if ((sendMotNodeID) <= 0) { + std::stringstream ss; + ss <<"ERROR, Interconnect Error: Unexpected Motion Node Id: "<entries_[sendMotNodeID - 1].get(); + if (pEntry != nullptr) { + UDPConn *conn = pEntry->conns_.size() ? pEntry->conns_[0].get() : nullptr; + + Assert(static_cast(pEntry->numConns) == pEntry->conns_.size()); + std::stringstream ss; + ss <<"ERROR, interconnect error: A HTAB entry for motion node "<conns_.size()<< + " first sock " << (conn != NULL ? conn->sockfd : -2); + throw ICInvalidIndex(ss.str(), __FILE__, __LINE__); + } +} + +CChunkTransportStateImpl::CChunkTransportStateImpl(ICSliceTable *_sliceTable) +{ + activated = false; + teardownActive = false; + + sliceTable = _sliceTable; + sliceId = sliceTable->localSlice; + icInstanceId = sliceTable->ic_instance_id; + + networkTimeoutIsLogged = false; + + clientState = NULL; +} + +ICChunkTransportState* +CChunkTransportStateImpl::SetupUDP(ICSliceTable *sliceTable, SessionMotionLayerIPCParam *param) +{ + if (param) + memcpy(&session_param, param, sizeof(*param)); + + /* + * The rx-thread might have set an error since last teardown, + * technically it is not part of current query, discard it directly. + */ + resetRxThreadError(); + + try { + ICChunkTransportState *state = CChunkTransportStateImpl::setup(sliceTable); + + /* Internal error if we locked the mutex but forgot to unlock it. */ + Assert(pthread_mutex_unlock(&ic_control_info.lock) != 0); + + return state; + + } catch (...) 
{ + /* + * Remove connections from hash table to avoid packet handling in the + * rx pthread, else the packet handling code could use memory whose + * context (InterconnectContext) would be soon reset - that could + * panic the process. + */ + ConnHashTable *ht = &ic_control_info.connHtab; + + for (int i = 0; i < ht->size; i++) + { + struct ConnHtabBin *trash; + UDPConn *conn = NULL; + + trash = ht->table[i]; + while (trash != NULL) + { + conn = trash->conn; + /* Get trash at first as trash will be pfree-ed in remove. */ + trash = trash->next; + ht->remove(conn); + } + } + pthread_mutex_unlock(&ic_control_info.lock); + + throw; + } +} + +void +CChunkTransportStateImpl::TeardownUDP(bool hasErrors) +{ + try { + CChunkTransportStateImpl::state_ = nullptr; + this->teardown(hasErrors); + delete this; + Assert(pthread_mutex_unlock(&ic_control_info.lock) != 0); + } catch (...) { + pthread_mutex_unlock(&ic_control_info.lock); + throw; + } +} + +CChunkTransportState ** +CChunkTransportStateImpl::GetTransportState() +{ + return &CChunkTransportStateImpl::state_; +} + +void +CChunkTransportStateImpl::RecvRoute(int16 motNodeID, + int16 srcRoute, + GetDataLenInPacket getLen, + DataBlock *data) +{ + try { + TransportEntry *pEntry = GetEntry(motNodeID, true); + pEntry->RecvRoute(srcRoute, getLen, data); + + /* error if mutex still held (debug build only) */ + Assert(pthread_mutex_unlock(&ic_control_info.lock) != 0); + } catch (...) { + pthread_mutex_unlock(&ic_control_info.lock); + throw; + } +} + +void +CChunkTransportStateImpl::RecvAny(int16 motNodeID, + int16 *srcRoute, + GetDataLenInPacket getLen, + DataBlock *data) +{ + try { + TransportEntry *pEntry = GetEntry(motNodeID, true); + pEntry->RecvAny(srcRoute, getLen, data); + + /* error if mutex still held (debug build only) */ + Assert(pthread_mutex_unlock(&ic_control_info.lock) != 0); + } catch (...) { + pthread_mutex_unlock(&ic_control_info.lock); + throw; + } +} + +void +CChunkTransportStateImpl::SendEOS(int motNodeID, DataBlock *data) +{ + int retry = 0; + int activeCount = 0; + int timeout = 0; + + TransportEntry *pEntry = GetEntry(motNodeID, true); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "Interconnect seg%d slice%d sending end-of-stream to slice%d", + global_param.segindex, motNodeID, pEntry->recvSlice->sliceIndex); + + /* + * we want to add our tcItem onto each of the outgoing buffers -- this is + * guaranteed to leave things in a state where a flush is *required*. + */ + pEntry->Broadcast(data, NULL); + + pEntry->sendingEos = true; + + uint64 now = getCurrentTime(); + + /* now flush all of the buffers. */ + for (int i = 0; i < pEntry->numConns; i++) + { + UDPConn *conn = pEntry->GetConn(i); + if (conn->stillActive) + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "sent eos to route %d tuplecount %d seq %d flags 0x%x stillActive %s icId %d %d", + conn->route, conn->tupleCount, conn->conn_info.seq, + conn->conn_info.flags, (conn->stillActive ? "true" : "false"), + conn->conn_info.icId, conn->msgSize); + + /* prepare this for transmit */ + conn->conn_info.flags |= UDPIC_FLAGS_EOS; + + /* place it into the send queue */ + conn->prepareXmit(); + conn->sndQueue.append(conn->curBuff); + conn->sendBuffers(); + conn->curBuff = NULL; + conn->pBuff = NULL; + + conn->tupleCount = 0; + conn->msgSize = sizeof(conn->conn_info); + conn->deadlockCheckBeginTime = now; + + activeCount++; + } + } + + /* + * Now waiting for acks from receivers. 
+ * + * Note here waiting is done in a separate phase from the EOS sending + * phase to make the processing faster when a lot of connections are slow + * and have frequent packet losses. In fault injection tests, we found + * this. + * + */ + while (activeCount > 0) + { + activeCount = 0; + + for (int i = 0; i < pEntry->numConns; i++) + { + UDPConn *conn = pEntry->GetConn(i); + if (conn->stillActive) + { + retry = 0; + ic_control_info.lastPacketSendTime = 0; + + /* wait until this queue is emptied */ + while (conn->unackQueue.length() > 0 || + conn->sndQueue.length() > 0) + { + timeout = conn->computeTimeout(retry); + + if (pEntry->pollAcks(timeout)) + pEntry->handleAcks(true); + + conn->checkExceptions(retry++, timeout); + + if (retry >= MAX_TRY) + { + if (conn->unackQueue.length() == 0) + conn->sendBuffers(); + break; + } + } + + if ((!conn->cdbProc) || (conn->unackQueue.length() == 0 && + conn->sndQueue.length() == 0)) + { + conn->state = mcsEosSent; + conn->stillActive = false; + } + else + activeCount++; + } + } + } + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "SendEOSleaving, activeCount %d", activeCount); +} + +void +CChunkTransportStateImpl::SendStop(int16 motNodeID) +{ + Assert(this->activated); + + TransportEntry *pEntry = GetEntry(motNodeID, true); + + /* + * Note: we're only concerned with receivers here. + */ + pthread_mutex_lock(&ic_control_info.lock); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "Interconnect needs no more input from slice%d; notifying senders to stop.", motNodeID); + + for (int i = 0; i < pEntry->numConns; i++) + { + UDPConn *conn = pEntry->GetConn(i); + + /* + * Note here, the stillActive flag of a connection may have been set + * to false by DeactiveConn. + */ + if (conn->stillActive) + { + if (conn->conn_info.flags & UDPIC_FLAGS_EOS) + { + /* + * we have a queued packet that has EOS in it. We've acked it, + * so we're done + */ + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "do sendstop: already have queued EOS packet, we're done. node %d route %d", motNodeID, i); + + conn->stillActive = false; + + /* need to drop the queues in the teardown function. */ + while (conn->pkt_q_size > 0) + conn->ReleaseBuffer(NULL); + } + else + { + conn->stopRequested = true; + conn->conn_info.flags |= UDPIC_FLAGS_STOP; + + /* + * The peer addresses for incoming connections will not be set + * until the first packet has arrived. However, when the lower + * slice does not have data to send, the corresponding peer + * address for the incoming connection will never be set. We + * will skip sending ACKs to those connections. + */ + +#ifdef FAULT_INJECTOR + if (FaultInjector_InjectFaultIfSet( + "interconnect_stop_ack_is_lost", + DDLNotSpecified, + "" /* databaseName */ , + "" /* tableName */ ) == FaultInjectorTypeSkip) + { + continue; + } +#endif + + if (conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6) + { + uint32 seq = conn->conn_info.seq > 0 ? conn->conn_info.seq - 1 : 0; + + conn->sendAck(UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, seq, seq); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "sent stop message. node %d route %d seq %d", motNodeID, i, seq); + } + else + { + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG1, "first packet did not arrive yet. don't sent stop message. 
node %d route %d", motNodeID, i); + } + } + } + } + pthread_mutex_unlock(&ic_control_info.lock); +} + +void +TransportEntry::Send(int16 targetRoute, DataBlock *db, bool broadcast, int *recount) +{ +#ifdef AMS_VERBOSE_LOGGING + LOG(DEBUG5, "SendTupleChunkToAMS: chunk length %d", db->len); +#endif + + if (broadcast) + { + this->Broadcast(db, recount); + } + else + { + if (targetRoute < 0 || targetRoute >= this->numConns) + { + std::stringstream ss; + ss << "FATAL: SendTupleChunkToAMS: targetRoute is " << targetRoute + << ", must be between 0 and " << this->numConns << " ."; + throw ICFatalException(ss.str(), __FILE__, __LINE__); + } + + /* handle pt-to-pt message. Primary */ + UDPConn *conn = this->GetConn(targetRoute); + + /* only send to interested connections */ + if (conn->stillActive) + { + conn->Send(db); + if (!conn->stillActive) + *recount = 1; + } + /* in 4.0 logical mirror xmit eliminated. */ + } +} + +bool +TransportEntry::SendData(int16 targetRoute, DataBlock *pblocks, int num, bool broadcast) +{ + int recount = 0; + + int payload = global_param.Gp_max_packet_size - sizeof(icpkthdr); + + /* + * tcItem can actually be a chain of tcItems. we need to send out all of + * them. + */ + for (int i = 0; i < num; ++i) + { + DataBlock db = *(pblocks + i); + while (db.len > 0) + { + DataBlock toSend; + toSend.len = db.len > payload ? payload : db.len; + toSend.pos = db.pos; + this->Send(targetRoute, &toSend, broadcast, &recount); + db.len -= toSend.len; + db.pos += toSend.len; + } + } + + if (recount == 0) + return true; + + /* if we don't have any connections active, return false */ + int i = 0; + for (i = 0; i < this->numConns; i++) + { + UDPConn *conn = this->GetConn(i); + if (conn->stillActive) + break; + } + + /* if we found an active connection we're not done */ + return (i < this->numConns); + +} + +bool +CChunkTransportStateImpl::SendData(int16 motNodeID, + int16 targetRoute, + DataBlock *pblocks, + int num, + bool broadcast) +{ + TransportEntry *pEntry = GetEntry(motNodeID, true); + return pEntry->SendData(targetRoute, pblocks, num, broadcast); +} + +void +CChunkTransportStateImpl::GetFreeSpace(int16 motNodeID, int16 targetRoute, BufferBlock *b) +{ + Assert(b != NULL); + + TransportEntry *pEntry = GetEntry(motNodeID, true); + + /* handle pt-to-pt message. Primary */ + UDPConn *conn = pEntry->GetConn(targetRoute); + + b->pos = NULL; + b->len = 0; + if (conn->stillActive) + { + b->pos = conn->pBuff + conn->msgSize; + Assert(global_param.Gp_max_packet_size >= conn->msgSize); + b->len = global_param.Gp_max_packet_size - conn->msgSize; + } +} + +void +CChunkTransportStateImpl::ReduceFreeSpace(int16 motNodeID, int16 targetRoute, int length) +{ + TransportEntry *pEntry = GetEntry(motNodeID, true); + + /* handle pt-to-pt message. 
Primary */ + UDPConn *conn = pEntry->GetConn(targetRoute); + + /* only send to interested connections */ + if (conn->stillActive) + { + Assert(conn->msgSize + length <= global_param.Gp_max_packet_size); + conn->msgSize += length; + conn->tupleCount++; + } +} + +void +CChunkTransportStateImpl::ReleaseAndAck(int motNodeID, int route) +{ + AckSendParam param; + + TransportEntry *pEntry = GetEntry(motNodeID, true); + UDPConn *conn = pEntry->GetConn(route); + + memset(¶m, 0, sizeof(AckSendParam)); + + pthread_mutex_lock(&ic_control_info.lock); + + if (conn->pBuff != NULL) + { + conn->ReleaseBuffer(¶m); + } + else + { + pthread_mutex_unlock(&ic_control_info.lock); + throw ICFatalException("FATAL: Interconnect error: tried to release a NULL buffer", __FILE__, __LINE__); + } + + pthread_mutex_unlock(&ic_control_info.lock); + + /* + * real ack sending is after lock release to decrease the lock holding + * time. + */ + if (param.msg.len != 0) + UDPConn::sendAckWithParam(¶m); +} + +void +CChunkTransportStateImpl::DeactiveRoute(int motNodeID, int srcRoute, const char *reason) +{ + TransportEntry *pEntry = GetEntry(motNodeID, true); + UDPConn *conn = pEntry->GetConn(srcRoute); + + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + { + LOG(DEBUG3, "Interconnect finished receiving from seg%d slice%d %s pid=%d sockfd=%d; %s", + conn->remoteContentId, pEntry->sendSlice->sliceIndex, conn->remoteHostAndPort, + conn->cdbProc->pid, conn->sockfd, reason); + } + +#ifdef AMS_VERBOSE_LOGGING + LOG(INFO, "deregisterReadInterest set stillactive = false for node %d route %d (%s)", motNodeID, srcRoute, reason); +#endif + + conn->DeactiveConn(); +} + +void* +CChunkTransportStateImpl::GetOpaqueDataInConn(int16 motNodeID, int16 targetRoute) +{ + TransportEntry *pEntry = this->GetEntry(motNodeID, true); + UDPConn *conn = pEntry->GetConn(targetRoute); + return conn->opaque_data; +} + +int32* +CChunkTransportStateImpl::GetSentRecordTypmodInConn(int16 motNodeID, int16 targetRoute) +{ + TransportEntry *pEntry = this->GetEntry(motNodeID, true); + UDPConn *conn = pEntry->GetConn(targetRoute); + return &conn->sent_record_typmod; +} + +int +CChunkTransportStateImpl::GetConnNum(int motNodeID) { + TransportEntry *pEntry = this->GetEntry(motNodeID, true); + return pEntry->conns_.size(); +} + +void +CChunkTransportStateImpl::NotifyQuit() { + thread_quit = true; +} + +void +CChunkTransportStateImpl::SetVectorEngineAsUser() { + vector_engine_is_user = true; +} + +CChunkTransportStateImpl* ToDerived(CChunkTransportState *ptr) { + return static_cast(ptr); +} + +/* + * C++ interface wrapper class based on class CChunkTransportStateImpl; + */ +ICChunkTransportState* +CChunkTransportState::SetupUDP(ICSliceTable *sliceTable, + SessionMotionLayerIPCParam *param) { + return CChunkTransportStateImpl::SetupUDP(sliceTable, param); +} + +void +CChunkTransportState::TeardownUDP(bool hasErrors) { + ToDerived(this)->TeardownUDP(hasErrors); +} + +void +CChunkTransportState::RecvRoute(int16 motNodeID, int16 srcRoute, + GetDataLenInPacket getLen, DataBlock *data) { + ToDerived(this)->RecvRoute(motNodeID, srcRoute, getLen, data); +} + +void +CChunkTransportState::RecvAny(int16 motNodeID, int16 *srcRoute, + GetDataLenInPacket getLen, DataBlock *data) { + ToDerived(this)->RecvAny(motNodeID, srcRoute, getLen, data); +} + +void +CChunkTransportState::SendEOS(int motNodeID, DataBlock *data) { + ToDerived(this)->SendEOS(motNodeID, data); +} + +void +CChunkTransportState::SendStop(int16 motNodeID) { + return 
ToDerived(this)->SendStop(motNodeID); +} + +bool +CChunkTransportState::SendData(int16 motNodeID, int16 targetRoute, DataBlock *pblocks, + int num, bool broadcast) { + return ToDerived(this)->SendData(motNodeID, targetRoute, pblocks, num, broadcast); +} + +void +CChunkTransportState::GetFreeSpace(int16 motNodeID, int16 targetRoute, BufferBlock *b) { + return ToDerived(this)->GetFreeSpace(motNodeID, targetRoute, b); +} + +void +CChunkTransportState::ReduceFreeSpace(int16 motNodeID, int16 targetRoute, int length) { + return ToDerived(this)->ReduceFreeSpace(motNodeID, targetRoute, length); +} + +void +CChunkTransportState::ReleaseAndAck(int motNodeID, int route) { + return ToDerived(this)->ReleaseAndAck(motNodeID, route); +} + +void +CChunkTransportState::DeactiveRoute(int motNodeID, int srcRoute, const char *reason) { + return ToDerived(this)->DeactiveRoute(motNodeID, srcRoute, reason); +} + +void* +CChunkTransportState::GetOpaqueDataInConn(int16 motNodeID, int16 targetRoute) { + return ToDerived(this)->GetOpaqueDataInConn(motNodeID, targetRoute); +} + +int32* +CChunkTransportState::GetSentRecordTypmodInConn(int16 motNodeID, int16 targetRoute) { + return ToDerived(this)->GetSentRecordTypmodInConn(motNodeID, targetRoute); +} + +int +CChunkTransportState::GetConnNum(int motNodeID) { + return ToDerived(this)->GetConnNum(motNodeID); +} + +void +CChunkTransportState::NotifyQuit() { + return ToDerived(this)->NotifyQuit(); +} + +void +CChunkTransportState::SetVectorEngineAsUser() { + return ToDerived(this)->SetVectorEngineAsUser(); +} + +CChunkTransportState** +CChunkTransportState::GetTransportState() { + return CChunkTransportStateImpl::GetTransportState(); +} + +/* + * C interface wrapper of UDP implement based C++ interface calss CChunkTransportState. + */ +#ifdef __cplusplus +extern "C" { +#endif + +static void handleException() +{ + try{ + throw; + } catch (const std::bad_alloc &e) { + SetLastError(LEVEL_ERROR, "out of memory"); + } catch (const ICFatalException &e) { + SetLastError(LEVEL_FATAL, e.msg()); + } catch (const ICException &e) { + SetLastError(LEVEL_ERROR, e.msg()); + } catch (const std::exception &e) { + SetLastError(LEVEL_ERROR, e.what()); + } catch (...) { + SetLastError(LEVEL_ERROR, "something unknown wrong happened!"); + } +} + +ICChunkTransportState* +UDP2_SetupUDP(ICSliceTable *sliceTable, SessionMotionLayerIPCParam *param) +{ + try { + return CChunkTransportState::SetupUDP(sliceTable, param); + } catch (...) { + handleException(); + } + + return NULL; +} + +/* + * TeardownUDPIFCInterconnect + * Tear down UDP interconnect. + * + * This function is called to release the resources used by interconnect. + */ +void +UDP2_TeardownUDP(ICChunkTransportState *transportStates, + bool hasErrors) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->TeardownUDP(hasErrors); + } catch (...) { + handleException(); + } +} + +/* + * RecvTupleChunkFromUDPIFC + * Receive tuple chunks from a specific route (connection) + */ +void +UDP2_RecvRoute(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 srcRoute, + GetDataLenInPacket getLen, + DataBlock *data) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->RecvRoute(motNodeID, srcRoute, getLen, data); + } catch (...) 
{ + handleException(); + } +} + +/* + * RecvTupleChunkFromAnyUDPIFC + * Receive tuple chunks from any route (connections) + */ +void +UDP2_RecvAny(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 *srcRoute, + GetDataLenInPacket getLen, + DataBlock *data) + +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->RecvAny(motNodeID, srcRoute, getLen, data); + } catch (...) { + handleException(); + } +} + +/* + * SendEOS + * broadcast eos messages to receivers. + */ +void +UDP2_SendEOS(ICChunkTransportState *transportStates, + int motNodeID, + DataBlock *data) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->SendEOS(motNodeID, data); + } catch (...) { + handleException(); + } +} + +void +UDP2_SendStop(ICChunkTransportState *transportStates, + int16 motNodeID) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->SendStop(motNodeID); + } catch (...) { + handleException(); + } +} + +bool +UDP2_SendData(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + DataBlock *pblocks, + int num, + bool broadcast) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + return cstate->SendData(motNodeID, targetRoute, pblocks, num, broadcast); + } catch (...) { + handleException(); + } + + return false; +} + +/* + * The fetches a direct pointer into our transmit buffers, along with + * an indication as to how much data can be safely shoved into the + * buffer (started at the pointed location). + * + * This works a lot like SendTupleChunkToAMS(). + */ +void +UDP2_GetFreeSpace(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + BufferBlock *b) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->GetFreeSpace(motNodeID, targetRoute, b); + } catch (...) { + handleException(); + } +} + +void +UDP2_ReduceFreeSpace(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + int length) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->ReduceFreeSpace(motNodeID, targetRoute, length); + } catch (...) { + handleException(); + } +} + +/* + * SendAck + * + * The cdbmotion code has discarded our pointer to the motion-conn + * structure, but has enough info to fully specify it. + */ +void +UDP2_ReleaseAndAck(ICChunkTransportState *transportStates, + int motNodeID, + int route) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + cstate->ReleaseAndAck(motNodeID, route); + } catch (...) { + handleException(); + } +} + +void +UDP2_DeactiveRoute(ICChunkTransportState *transportStates, int motNodeID, int + srcRoute, const char *reason) { Assert(transportStates); + + try { CChunkTransportState *cstate = + static_cast(transportStates); + cstate->DeactiveRoute(motNodeID, srcRoute, reason); + } catch (...) { + handleException(); + } +} + +void* +UDP2_GetOpaqueDataInConn(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + return cstate->GetOpaqueDataInConn(motNodeID, targetRoute); + } catch (...) 
{ + handleException(); + } + + return NULL; +} + +int32* +UDP2_GetSentRecordTypmodInConn(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute) +{ + Assert(transportStates); + + try { + CChunkTransportState *cstate = static_cast(transportStates); + return cstate->GetSentRecordTypmodInConn(motNodeID, targetRoute); + } catch (...) { + handleException(); + } + + return NULL; +} + +void +UDP2_InitUDPIFC(GlobalMotionLayerIPCParam *param) +{ + if (param) + { + if (global_param.interconnect_address) + free(global_param.interconnect_address); + + memcpy(&global_param, param, sizeof(*param)); + global_param.interconnect_address = strdup(param->interconnect_address); + } + + try { + InitMotionUDPIFC(&UDP_listenerFd, &udp_listener_port); + + if (IC_DEBUG1 >= session_param.log_min_messages) + LOG(DEBUG1, "Interconnect listening on udp port %d ", udp_listener_port); + + } catch (...) { + handleException(); + } +} + +void +UDP2_CleanUpUDPIFC(void) +{ + if (session_param.gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG_IC) + LOG(DEBUG3, "Cleaning Up Motion Layer IPC..."); + + try { + CleanupMotionUDPIFC(); + } catch (...) { + handleException(); + } + + if (UDP_listenerFd >= 0) + closesocket(UDP_listenerFd); + + /* be safe and reset global state variables. */ + udp_listener_port = 0; + UDP_listenerFd = -1; +} + +void +UDP2_WaitQuitUDPIFC(void) +{ + /* + * Just in case ic thread is waiting on the locks. + */ + pthread_mutex_unlock(&ic_control_info.lock); + + ic_atomic_write_u32(&ic_control_info.shutdown, 1); + + if (ic_control_info.threadCreated) + { + SendDummyPacket(); + pthread_join(ic_control_info.threadHandle, NULL); + } + ic_control_info.threadCreated = false; +} + +uint32 +UDP2_GetActiveConns(void) +{ + return ic_statistics.activeConnectionsNum; +} + +int +UDP2_GetICHeaderSizeUDP(void) +{ + return sizeof(struct icpkthdr); +} + +int32 +UDP2_GetListenPortUDP(void) +{ + return udp_listener_port; +} + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file diff --git a/contrib/udp2/ic_common/udp2/ic_udp2.h b/contrib/udp2/ic_common/udp2/ic_udp2.h new file mode 100644 index 00000000000..e49da698b79 --- /dev/null +++ b/contrib/udp2/ic_common/udp2/ic_udp2.h @@ -0,0 +1,102 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * ic_udp2.h + * + * IDENTIFICATION + * contrib/udp2/ic_common/udp2/ic_udp2.h + * + *------------------------------------------------------------------------- + */ + +#ifndef IC_UDP2_H +#define IC_UDP2_H + +#include "ic_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern void UDP2_InitUDPIFC(struct GlobalMotionLayerIPCParam *param); +extern void UDP2_CleanUpUDPIFC(void); +extern void UDP2_WaitQuitUDPIFC(void); + +extern ICChunkTransportState* UDP2_SetupUDP(ICSliceTable *sliceTable, + SessionMotionLayerIPCParam *param); +extern void UDP2_TeardownUDP(ICChunkTransportState *transportStates, + bool hasErrors); + +// recv +extern void UDP2_RecvRoute(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 srcRoute, + GetDataLenInPacket getLen, + DataBlock *data); +extern void UDP2_RecvAny(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 *srcRoute, + GetDataLenInPacket getLen, + DataBlock *data); +extern void UDP2_SendStop(ICChunkTransportState *transportStates, int16 motNodeID); +extern void UDP2_ReleaseAndAck(ICChunkTransportState *transportStates, + int motNodeID, + int route); +extern void UDP2_DeactiveRoute(ICChunkTransportState *transportStates, + int motNodeID, + int srcRoute, + const char *reason); + +// send +extern void UDP2_SendEOS(ICChunkTransportState *transportStates, + int motNodeID, + DataBlock *data); +extern bool UDP2_SendData(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + DataBlock *pblocks, + int num, + bool broadcast); +extern void UDP2_GetFreeSpace(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + BufferBlock *b); +extern void UDP2_ReduceFreeSpace(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + int length); + + +// utility func +extern void* UDP2_GetOpaqueDataInConn(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute); +extern int32* UDP2_GetSentRecordTypmodInConn(ICChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute); + +extern uint32 UDP2_GetActiveConns(void); +extern int UDP2_GetICHeaderSizeUDP(void); +extern int32 UDP2_GetListenPortUDP(void); + +#ifdef __cplusplus +} +#endif + +#endif // IC_UDP2_H \ No newline at end of file diff --git a/contrib/udp2/ic_common/udp2/ic_udp2.hpp b/contrib/udp2/ic_common/udp2/ic_udp2.hpp new file mode 100644 index 00000000000..a7e6356fb9c --- /dev/null +++ b/contrib/udp2/ic_common/udp2/ic_udp2.hpp @@ -0,0 +1,68 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
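Taken together, the UDP2_* declarations above are the whole surface a C caller needs: initialize the module once, set up per-query transport state, push data, broadcast end-of-stream, then tear down. A rough, hedged usage sketch follows; how the slice table and the global/session parameter structs are built is driver-specific and elided, and the motion node id, route, and payload below are placeholders rather than a verified calling contract.

```cpp
// Rough usage sketch of the C wrapper API declared in ic_udp2.h above.
// Parameter construction is caller-specific and not shown.
#include "ic_udp2.h"

void
run_sender_sketch(GlobalMotionLayerIPCParam *globalParam,
                  ICSliceTable *sliceTable,
                  SessionMotionLayerIPCParam *sessionParam,
                  DataBlock *blocks, int nblocks)
{
    UDP2_InitUDPIFC(globalParam);                     /* once per process */

    ICChunkTransportState *state = UDP2_SetupUDP(sliceTable, sessionParam);
    if (state == NULL)
        return;                                       /* failure reported via SetLastError */

    /* point-to-point send on motion node 1, route 0; a false return means
     * no connection on this entry is still active */
    bool stillActive = UDP2_SendData(state, 1, 0, blocks, nblocks, false);
    (void) stillActive;

    DataBlock eos = {};                               /* EOS payload; real contents are
                                                       * produced by the motion layer */
    UDP2_SendEOS(state, 1, &eos);

    UDP2_TeardownUDP(state, false /* hasErrors */);
}
```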
+ * + * ic_udp2.hpp + * + * IDENTIFICATION + * contrib/udp2/ic_common/udp2/ic_udp2.hpp + * + *------------------------------------------------------------------------- + */ +#ifndef IC_UDP2_HPP +#define IC_UDP2_HPP + +#include +#include +#include +#include +#include + +#include "ic_types.h" +#include "ic_except.hpp" + +struct CChunkTransportState : public ICChunkTransportState +{ +public: + static ICChunkTransportState* SetupUDP(ICSliceTable *sliceTable, SessionMotionLayerIPCParam *param); + void TeardownUDP(bool hasErrors); + + void RecvRoute(int16 motNodeID, int16 srcRoute, GetDataLenInPacket getLen, DataBlock *data); + void RecvAny(int16 motNodeID, int16 *srcRoute, GetDataLenInPacket getLen, DataBlock *data); + void ReleaseAndAck(int motNodeID, int route); + void SendStop(int16 motNodeID); + void DeactiveRoute(int motNodeID, int srcRoute, const char *reason); + + void SendEOS(int motNodeID, DataBlock *data); + bool SendData(int16 motNodeID, int16 targetRoute, DataBlock *pblocks, int num, bool broadcast); + void GetFreeSpace(int16 motNodeID, int16 targetRoute, BufferBlock *b); + void ReduceFreeSpace(int16 motNodeID, int16 targetRoute, int length); + + void* GetOpaqueDataInConn(int16 motNodeID, int16 targetRoute); + int32* GetSentRecordTypmodInConn(int16 motNodeID, int16 targetRoute); + + int GetConnNum(int motNodeID); + + static CChunkTransportState** GetTransportState(); + + /* APIs for vector engine */ + void NotifyQuit(); + void SetVectorEngineAsUser(); +}; + +#endif // IC_UDP2_HPP \ No newline at end of file diff --git a/contrib/udp2/ic_common/udp2/ic_udp2_internal.hpp b/contrib/udp2/ic_common/udp2/ic_udp2_internal.hpp new file mode 100644 index 00000000000..2602133a9e5 --- /dev/null +++ b/contrib/udp2/ic_common/udp2/ic_udp2_internal.hpp @@ -0,0 +1,1319 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * ic_udp2_internal.hpp + * + * IDENTIFICATION + * contrib/udp2/ic_common/udp2/ic_udp2_internal.hpp + * + *------------------------------------------------------------------------- + */ +#ifndef IC_UDP2_INTERNAL_HPP +#define IC_UDP2_INTERNAL_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ic_udp2.hpp" +#include "ic_utility.hpp" + +namespace { + +typedef enum MotionConnState +{ + mcsNull, + mcsAccepted, + mcsSetupOutgoingConnection, + mcsConnecting, + mcsRecvRegMsg, + mcsSendRegMsg, + mcsStarted, + mcsEosSent +} MotionConnState; + +/* + * Structure used for keeping track of a pt-to-pt connection between two + * Cdb Entities (either QE or QD). + */ +typedef struct MotionConn +{ + /* socket file descriptor. */ + int sockfd; + + /* pointer to the data buffer. */ + uint8 *pBuff; + + /* size of the message in the buffer, if any. */ + int32 msgSize; + + /* position of message inside of buffer, "cursor" pointer */ + uint8 *msgPos; + + /* + * recv bytes: we can have more than one message/message fragment in recv + * queue at once + */ + int32 recvBytes; + + int tupleCount; + + /* + * false means 1) received a stop message and has handled it. 2) received + * EOS message or sent out EOS message 3) received a QueryFinishPending + * notify and has handled it. + */ + bool stillActive; + + /* + * used both by motion sender and motion receiver + * + * sender: true means receiver don't need to consume tuples any more, + * sender is also responsible to send stop message to its senders. + * + * receiver: true means have sent out a stop message to its senders. The + * stop message might be lost, stopRequested can also tell sender that no + * more data needed in the ack message. + */ + bool stopRequested; + + MotionConnState state; + + ICCdbProcess *cdbProc; + int remoteContentId; + char remoteHostAndPort[128]; /* Numeric IP addresses should never + * be longer than about 50 chars, but + * play it safe */ + + void *opaque_data; + + /* + * used by the sender. + * + * the typmod of last sent record type in current connection, + * if the connection is for broadcasting then we only check + * and update this attribute on connection 0. + */ + int32 sent_record_typmod; + +} MotionConn; + +/* + * Used to organize all of the information for a given motion node. + */ +typedef struct ChunkTransportStateEntry +{ + int motNodeId; + bool valid; + + /* Connection array + * + * MUST pay attention: use getMotionConn to get MotionConn. + * must not use `->conns[index]` to get MotionConn. Because the struct + * MotionConn is a base structure for MotionConnTCP and + * MotionConnUDP. After connection setup, the `conns` will be fill + * with MotionConnUDP/MotionConnTCP, but the pointer still is + * MotionConn which should use `CONTAINER_OF` to get the real object. + */ + MotionConn *conns; + int numConns; + + int scanStart; + + /* slice table entries */ + struct ICExecSlice *sendSlice; + struct ICExecSlice *recvSlice; + +} ChunkTransportStateEntry; + +typedef struct icpkthdr +{ + int32 motNodeId; + + /* + * three pairs which seem useful for identifying packets. + * + * MPP-4194: It turns out that these can cause collisions; but the high + * bit (1<<31) of the dstListener port is now used for disambiguation with + * mirrors. 
+ */ + int32 srcPid; + int32 srcListenerPort; + + int32 dstPid; + int32 dstListenerPort; + + int32 sessionId; + int32 icId; + + int32 recvSliceIndex; + int32 sendSliceIndex; + int32 srcContentId; + int32 dstContentId; + + /* MPP-6042: add CRC field */ + uint32 crc; + + /* packet specific info */ + int32 flags; + uint32 len; + + /* + * The usage of seq and extraSeq field + * a) In a normal DATA packet + * seq -> the data packet sequence number + * extraSeq -> not used + * b) In a normal ACK message (UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY) + * seq -> the largest seq of the continuously cached packets + * sometimes, it is special, for exampke, conn req ack, mismatch ack. + * extraSeq -> the largest seq of the consumed packets + * c) In a start race NAK message (UPDIC_FLAGS_NAK) + * seq -> the seq from the pkt + * extraSeq -> the extraSeq from the pkt + * d) In a DISORDER message (UDPIC_FLAGS_DISORDER) + * seq -> packet sequence number that triggers the disorder message + * extraSeq -> the largest seq of the received packets + * e) In a DUPLICATE message (UDPIC_FLAGS_DUPLICATE) + * seq -> packet sequence number that triggers the duplicate message + * extraSeq -> the largest seq of the continuously cached packets + * f) In a stop messege (UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY) + * seq -> the largest seq of the continuously cached packets + * extraSeq -> the largest seq of the continuously cached packets + * + * + * NOTE that: EOS/STOP flags are often saved in conn_info structure of a connection. + * It is possible for them to be sent together with other flags. + * + */ + uint32 seq; + uint32 extraSeq; + uint64_t send_time; + uint64_t recv_time; + uint8_t retry_times; +} icpkthdr; + +typedef struct ICBuffer ICBuffer; +typedef struct ICBufferLink ICBufferLink; + +typedef enum ICBufferListType +{ + ICBufferListType_Primary, + ICBufferListType_Secondary, + ICBufferListType_UNDEFINED +} ICBufferListType; + +struct ICBufferLink +{ + ICBufferLink *next; + ICBufferLink *prev; +}; + +/* + * ICBufferList + * ic buffer list data structure. + * + * There are two kinds of lists. The first kind of list uses the primary next/prev pointers. + * And the second kind uses the secondary next/prev pointers. + */ +struct ICBufferList +{ + int len; + ICBufferListType type; /* primary or secondary */ + + ICBufferLink head; + +#if defined(USE_ASSERT_CHECKING) || defined(AMS_VERBOSE_LOGGING) + void icBufferListLog(); +#endif + +#ifdef USE_ASSERT_CHECKING + void icBufferListCheck(const char *prefix); +#endif + + void init(ICBufferListType type); + void destroy(); + + bool is_head(ICBufferLink *link); + int length(); + ICBufferLink* first(); + + ICBuffer* append(ICBuffer *buf); + ICBuffer* remove(ICBuffer *buf); + ICBuffer* pop(); + + void release(bool inExpirationQueue); + + void dump_to_file(FILE *ofile); +}; + +#define GET_ICBUFFER_FROM_PRIMARY(ptr) CONTAINER_OF(ptr, ICBuffer, primary) +#define GET_ICBUFFER_FROM_SECONDARY(ptr) CONTAINER_OF(ptr, ICBuffer, secondary) + +/* + * ICBuffer + * interconnect buffer data structure. + * + * In some cases, an ICBuffer may exists in two lists/queues, + * thus it has two sets of pointers. For example, an ICBuffer + * can exist in an unack queue and an expiration queue at the same time. + * + * It is important to get the ICBuffer address when we iterate a list of + * ICBuffers through primary/secondary links. The Macro GET_ICBUFFER_FROM_PRIMARY + * and GET_ICBUFFER_FROM_SECONDARY are for this purpose. 
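Because a buffer can sit in two queues at once, list operations hand back a link node rather than the buffer itself, and the GET_ICBUFFER_FROM_PRIMARY/SECONDARY macros use CONTAINER_OF to recover the owning ICBuffer. A minimal sketch of that pointer arithmetic is shown below, using the conventional offsetof-based CONTAINER_OF; the project's own macro (pulled in via ic_utility.hpp) may be defined differently.

```cpp
// Minimal illustration of mapping an embedded link node back to its buffer.
// CONTAINER_OF here is the conventional offsetof-based form, not necessarily
// the project's definition.
#include <cstddef>

#define CONTAINER_OF(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

struct Link { Link *next; Link *prev; };
struct Buf  { Link primary; Link secondary; int payload; };

static Buf *from_primary(Link *l)   { return CONTAINER_OF(l, Buf, primary); }
static Buf *from_secondary(Link *l) { return CONTAINER_OF(l, Buf, secondary); }

int main()
{
    Buf b{};
    b.payload = 42;

    /* whichever list handed us the link, we recover the same buffer */
    return (from_primary(&b.primary)->payload ==
            from_secondary(&b.secondary)->payload) ? 0 : 1;
}
```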
+ * + */ +struct ICBuffer +{ + /* primary next and prev pointers */ + ICBufferLink primary; + + /* secondary next and prev pointers */ + ICBufferLink secondary; + + /* connection that this buffer belongs to */ + MotionConn *conn; + + /* + * Three fields for expiration processing + * + * sentTime - the time this buffer was sent nRetry - the number of send + * retries unackQueueRingSlot - unack queue ring slot index + */ + uint64 sentTime; + int32 nRetry; + int32 unackQueueRingSlot; + + /* real data */ + icpkthdr pkt[0]; +}; + +static inline void* +ic_malloc(size_t size) +{ + return malloc(size); +} + +static inline void* +ic_malloc0(size_t size) +{ + void *rs = ic_malloc(size); + if (rs) + memset(rs, 0, size); + return rs; +} + +static inline void +ic_free(void *p) +{ + free(p); +} + +static inline void +ic_free_clean(void **p) +{ + ic_free(*p); + *p = NULL; +} + +static inline void +ic_usleep(long microsec) +{ + if (microsec > 0) + { + struct timeval delay; + + delay.tv_sec = microsec / 1000000L; + delay.tv_usec = microsec % 1000000L; + (void) select(0, NULL, NULL, NULL, &delay); + } +} + +/* + * Put socket into nonblock mode. + * Returns true on success, false on failure. + */ +static inline bool +ic_set_noblock(int sock) +{ + int flags; + + flags = fcntl(sock, F_GETFL); + if (flags < 0) + return false; + if (fcntl(sock, F_SETFL, (flags | O_NONBLOCK)) == -1) + return false; + return true; +} + +/* ic_atomic_xxx */ +typedef struct ic_atomic_uint32 +{ + volatile uint32 value; +} ic_atomic_uint32; + +static inline void +ic_atomic_init_u32(volatile ic_atomic_uint32 *ptr, uint32 val) +{ + ptr->value = val; +} + +static inline uint32 +ic_atomic_read_u32(volatile ic_atomic_uint32 *ptr) +{ + return ptr->value; +} + +static inline void +ic_atomic_write_u32(volatile ic_atomic_uint32 *ptr, uint32 val) +{ + ptr->value = val; +} + +static inline bool +ic_atomic_compare_exchange_u32(volatile ic_atomic_uint32 *ptr, + uint32 *expected, uint32 newval) +{ + /* FIXME: we can probably use a lower consistency model */ + return __atomic_compare_exchange_n(&ptr->value, expected, newval, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); +} + +static inline uint32 +ic_atomic_add_fetch_u32(volatile ic_atomic_uint32 *ptr, int32 add_) +{ + return __sync_fetch_and_add(&ptr->value, add_) + add_; +} + +static inline uint32 +ic_bswap32(uint32 x) +{ + return + ((x << 24) & 0xff000000) | + ((x << 8) & 0x00ff0000) | + ((x >> 8) & 0x0000ff00) | + ((x >> 24) & 0x000000ff); +} + +#define TIMEOUT_Z +#define RTT_SHIFT_ALPHA (3) /* srtt (0.125) */ +#define LOSS_THRESH (3) /* Packet loss triggers Karn */ +#define RTO_MIN (5000) /* MIN RTO(ms) */ +#define RTO_MAX (100000) /* MAX RTO(ms) */ +#define UDP_INFINITE_SSTHRESH 0x7fffffff + +#define SEC_TO_USEC(t) ((t) * 1000000) +#define SEC_TO_MSEC(t) ((t) * 1000) +#define MSEC_TO_USEC(t) ((t) * 1000) +#define USEC_TO_SEC(t) ((t) / 1000000) +#define TIME_TICK (1000000/HZ)/* in us */ + +#define UDP_INITIAL_RTO (MSEC_TO_USEC(200)) +#define UDP_DEFAULT_MSS 1460 + +#define RTO_HASH (3000) + +#define UDP_SEQ_LT(a,b) ((int32_t)((a)-(b)) < 0) +#define UDP_SEQ_LEQ(a,b) ((int32_t)((a)-(b)) <= 0) +#define UDP_SEQ_GT(a,b) ((int32_t)((a)-(b)) > 0) +#define UDP_SEQ_GEQ(a,b) ((int32_t)((a)-(b)) >= 0) + +#define UDP_RTO_MIN ((unsigned)(HZ/5)) + +struct UDPConn; +struct rto_hashstore +{ + uint32_t rto_now_idx; /* pointing the hs_table_s index */ + uint32_t rto_now_ts; + + TAILQ_HEAD(rto_head, UDPConn) rto_list[RTO_HASH + 1]; +}; + +struct mudp_manager +{ + struct rto_hashstore *rto_store; /* lists related 
to timeout */ + + int rto_list_cnt; + uint32_t cur_ts; +}; + +typedef struct mudp_manager* mudp_manager_t; + +#define MAX_TRY (11) +#define TIMEOUT(try) ((try) < MAX_TRY ? (timeoutArray[(try)]) : (timeoutArray[MAX_TRY])) + +#define USECS_PER_SECOND 1000000 +#define MSECS_PER_SECOND 1000 + +/* 1/4 sec in msec */ +#define RX_THREAD_POLL_TIMEOUT (250) + +/* + * Flags definitions for flag-field of UDP-messages + * + * We use bit operations to test these, flags are powers of two only + */ +#define UDPIC_FLAGS_RECEIVER_TO_SENDER (1) +#define UDPIC_FLAGS_ACK (2) +#define UDPIC_FLAGS_STOP (4) +#define UDPIC_FLAGS_EOS (8) +#define UDPIC_FLAGS_NAK (16) +#define UDPIC_FLAGS_DISORDER (32) +#define UDPIC_FLAGS_DUPLICATE (64) +#define UDPIC_FLAGS_CAPACITY (128) +#define UDPIC_FLAGS_FULL (256) + +#define UDPIC_MIN_BUF_SIZE (128 * 1024) + +/* + * ConnHtabBin + * + * A connection hash table bin. + * + */ +typedef struct ConnHtabBin ConnHtabBin; +struct ConnHtabBin +{ + UDPConn *conn; + struct ConnHtabBin *next; +}; + +/* + * ConnHashTable + * + * Connection hash table definition. + * + */ +typedef struct ConnHashTable ConnHashTable; +struct ConnHashTable +{ + ConnHtabBin **table; + int size; + + bool init(); + bool add(UDPConn *conn); + UDPConn *find(icpkthdr *hdr); + void destroy(); + void remove(UDPConn *conn); +}; + +#define CONN_HASH_VALUE(icpkt) ((uint32)((((icpkt)->srcPid ^ (icpkt)->dstPid)) + (icpkt)->dstContentId)) +#define CONN_HASH_MATCH(a, b) (((a)->motNodeId == (b)->motNodeId && \ + (a)->dstContentId == (b)->dstContentId && \ + (a)->srcContentId == (b)->srcContentId && \ + (a)->recvSliceIndex == (b)->recvSliceIndex && \ + (a)->sendSliceIndex == (b)->sendSliceIndex && \ + (a)->srcPid == (b)->srcPid && \ + (a)->dstPid == (b)->dstPid && (a)->icId == (b)->icId)) + +/* + * CursorICHistoryEntry + * + * The definition of cursor IC history entry. + */ +typedef struct CursorICHistoryEntry CursorICHistoryEntry; +struct CursorICHistoryEntry +{ + /* Interconnect instance id. */ + uint32 icId; + + /* Command id. */ + uint32 cid; + + /* + * Interconnect instance status. state 1 (value 1): interconnect is setup + * state 0 (value 0): interconnect was torn down. + */ + uint8 status; + + /* Next entry. */ + CursorICHistoryEntry *next; + + CursorICHistoryEntry(uint32 aicId, uint32 acid): + icId(aicId), cid(acid),status(1){} +}; + +/* + * CursorICHistoryTable + * + * Cursor IC history table. It is a small hash table. + */ +typedef struct CursorICHistoryTable CursorICHistoryTable; +struct CursorICHistoryTable +{ + uint32 size; + uint32 count; + CursorICHistoryEntry **table; + + void init() { + count = 0; + size = session_param.Gp_interconnect_cursor_ic_table_size; + table = (CursorICHistoryEntry **)ic_malloc0(sizeof(CursorICHistoryEntry *) * size); + } + + void add(uint32 icId, uint32 cid) { + uint32 index = icId % size; + CursorICHistoryEntry *p = new CursorICHistoryEntry(icId, cid); + + p->next = this->table[index]; + this->table[index] = p; + this->count++; + + LOG(DEBUG2, "add icid %d cid %d status %d", p->icId, p->cid, p->status); + + return; + } + + /* + * state 1 (value 1): interconnect is setup + * state 0 (value 0): interconnect was torn down. 
+ */ + void update(uint32 icId, uint8 status) { + for (CursorICHistoryEntry *p = table[icId % size]; p; p = p->next) { + if (p->icId == icId) { + p->status = status; + return; + } + } + } + + CursorICHistoryEntry* get(uint32 icId) { + for (CursorICHistoryEntry *p = table[icId % size]; p; p = p->next) { + if (p->icId == icId) + return p; + } + return NULL; + } + + void purge() { + for (uint8 index = 0; index < size; index++) { + while (table[index]) { + CursorICHistoryEntry *trash = table[index]; + table[index] = trash->next; + delete trash; + } + } + } + + void prune(uint32 icId); +}; + +/* + * Synchronization timeout values + * + * MAIN_THREAD_COND_TIMEOUT - 1/4 second + */ +#define MAIN_THREAD_COND_TIMEOUT_MS (250) + +/* + * Used for synchronization between main thread (receiver) and background thread. + * + */ +typedef struct ThreadWaitingState ThreadWaitingState; +struct ThreadWaitingState +{ + bool waiting; + int waitingNode; + int waitingRoute; + int reachRoute; + + /* main_thread_waiting_query is needed to disambiguate for cursors */ + int waitingQuery; + + void reset() { + waiting = false; + waitingNode = -1; + waitingRoute = ANY_ROUTE; + reachRoute = ANY_ROUTE; + waitingQuery = -1; + } + + void set(int motNodeId, int route, int icId) { + waiting = true; + waitingNode = motNodeId; + waitingRoute = route; + reachRoute = ANY_ROUTE; + waitingQuery = icId; + } +}; + +/* + * ReceiveControlInfo + * + * The related control information for receiving data packets. + * Main thread (Receiver) and background thread use the information in + * this data structure to handle data packets. + * + */ +typedef struct ReceiveControlInfo ReceiveControlInfo; +struct ReceiveControlInfo +{ + /* Main thread waiting state. */ + ThreadWaitingState mainWaitingState; + + /* + * Buffers used to assemble disorder messages at receiver side. + */ + icpkthdr *disorderBuffer; + + /* The last interconnect instance id which is torn down. */ + int32 lastTornIcId; + + /* Cursor history table. */ + CursorICHistoryTable cursorHistoryTable; + + /* + * Last distributed transaction id when SetupUDPInterconnect is called. + * Coupled with cursorHistoryTable, it is used to handle multiple + * concurrent cursor cases. + */ + DistributedTransactionId lastDXatId; +}; + +/* + * RxBufferPool + * + * Receive thread buffer pool definition. The implementation of + * receive side buffer pool is different from send side buffer pool. + * It is because receive side buffer pool needs a ring buffer to + * easily implement disorder message handling logic. + */ + +typedef struct RxBufferPool RxBufferPool; +struct RxBufferPool +{ + /* The max number of buffers we can get from this pool. */ + int maxCount; + + /* The number of allocated buffers */ + int count; + + /* The list of free buffers. */ + char *freeList; + + void put(icpkthdr *buf); + void release(icpkthdr *buf); + icpkthdr* get(); + icpkthdr* get_free(); +}; + +/* + * SendBufferPool + * + * The send side buffer pool definition. + * + */ +typedef struct SendBufferPool SendBufferPool; +struct SendBufferPool +{ + /* The maximal number of buffers sender can use. */ + int maxCount; + + /* The number of buffers sender already used. */ + int count; + + /* The free buffer list at the sender side. */ + ICBufferList freeList; + + void init(); + void clean(); + ICBuffer* get(UDPConn *conn); +}; + +/* + * SendControlInfo + * + * The related control information for sending data packets and handling acks. 
+ * Main thread use the information in this data structure to do ack handling + * and congestion control. + * + */ +typedef struct SendControlInfo SendControlInfo; +struct SendControlInfo +{ + /* The buffer used for accepting acks */ + icpkthdr *ackBuffer; + + /* congestion window */ + float cwnd; + + /* minimal congestion control window */ + float minCwnd; + + /* slow start threshold */ + float ssthresh; + +}; + +/* + * ICGlobalControlInfo + * + * Some shared control information that is used by main thread (senders, receivers, or both) + * and the background thread. + * + */ +typedef struct ICGlobalControlInfo ICGlobalControlInfo; +struct ICGlobalControlInfo +{ + /* The background thread handle. */ + pthread_t threadHandle; + + /* Keep the udp socket buffer size used. */ + uint32 socketSendBufferSize; + uint32 socketRecvBufferSize; + + uint64 lastExpirationCheckTime; + uint64 lastDeadlockCheckTime; + + /* Used to decide whether to retransmit for capacity based FC. */ + uint64 lastPacketSendTime; + + /* + * Lock and latch for coordination between main thread and + * background thread. It protects the shared data between the two threads + * (the connHtab, rx buffer pool and the mainWaitingState etc.). + */ + pthread_mutex_t lock; + + /* Am I a sender? */ + bool isSender; + + /* Flag showing whether the thread is created. */ + bool threadCreated; + + /* Error number. Actually int but we do not have ic_atomic_int32. */ + ic_atomic_uint32 eno; + + /* + * Global connection htab for both sending connections and receiving + * connections. Protected by the lock in this data structure. + */ + ConnHashTable connHtab; + + /* The connection htab used to cache future packets. */ + ConnHashTable startupCacheHtab; + + /* Used by main thread to ask the background thread to exit. */ + ic_atomic_uint32 shutdown; + + /*Serialization + * Used by ic thread in the QE to identify the current serving ic instance + * and handle the mismatch packets. It is not used by QD because QD may have + * cursors, QD may receive packets for open the cursors with lower instance + * id, QD use cursorHistoryTable to handle packets mismatch. + */ + int32 ic_instance_id; +}; + +/* + * Macro for unack queue ring, round trip time (RTT) and expiration period (RTO) + * + * UNACK_QUEUE_RING_SLOTS_NUM - the number of slots in the unack queue ring. + * this value should be greater than or equal to 2. + * TIMER_SPAN - timer period in us + * TIMER_CHECKING_PERIOD - timer checking period in us + * UNACK_QUEUE_RING_LENGTH - the whole time span of the unack queue ring + * DEFAULT_RTT - default rtt in us. + * MIN_RTT - min rtt in us + * MAX_RTT - max rtt in us + * RTT_SHIFT_COEFFICIENT - coefficient for RTT computation + * + * DEFAULT_DEV - default round trip standard deviation + * MAX_DEV - max dev + * DEV_SHIFT_COEFFICIENT - coefficient for DEV computation + * + * MAX_EXPIRATION_PERIOD - max expiration period in us + * MIN_EXPIRATION_PERIOD - min expiration period in us + * MAX_TIME_NO_TIMER_CHECKING - max time without checking timer + * DEADLOCK_CHECKING_TIME - deadlock checking time + * + * MAX_SEQS_IN_DISORDER_ACK - max number of sequences that can be transmitted in a + * disordered packet ack. + * + * + * Considerations on the settings of the values: + * + * TIMER_SPAN and UNACK_QUEUE_RING_SLOTS_NUM define the ring period. + * Currently, it is UNACK_QUEUE_RING_LENGTH (default 10 seconds). 
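To make the sizing argument concrete, here is the arithmetic behind the ring length and the per-slot retransmit estimate discussed in this comment, using the defaults it quotes (2000 slots, a 5 ms timer span, and a 100 MB sender buffer as the worked example):

```cpp
// Back-of-the-envelope arithmetic for the unack queue ring, using the default
// values quoted in the surrounding comment (not read from live GUCs).
#include <cstdio>

int main()
{
    constexpr unsigned long long slots       = 2000;           /* UNACK_QUEUE_RING_SLOTS_NUM */
    constexpr unsigned long long timerSpanUs = 5 * 1000;       /* 5 ms default timer period  */
    constexpr unsigned long long ringLenUs   = slots * timerSpanUs;   /* = 10 seconds */

    constexpr unsigned long long sendBuf     = 100ULL << 20;   /* 100 MB example pool */
    constexpr unsigned long long perSlot     = sendBuf / slots; /* ~51 KB resent per slot */

    printf("ring covers %llu ms, ~%llu KB per slot\n",
           ringLenUs / 1000, perSlot / 1024);
    return 0;
}
```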
+ * + * The definition of UNACK_QUEUE_RING_LENGTH is quite related to the size of + * sender side buffer and the size we may resend in a burst for an expiration event + * (which may overwhelm switch or OS if it is too large). + * Thus, we do not want to send too much data in a single expiration event. Here, a + * relatively large UNACK_QUEUE_RING_SLOTS_NUM value is used to avoid that. + * + * If the sender side buffer is X (MB), then on each slot, + * there are about X/UNACK_QUEUE_RING_SLOTS_NUM. Even we have a very large sender buffer, + * for example, 100MB, there is about 96M/2000 = 50K per slot. + * This is fine for the OS (with buffer 2M for each socket generally) and switch. + * + * Note that even when the buffers are not evenly distributed in the ring and there are some packet + * losses, the congestion control mechanism, the disorder and duplicate packet handling logic will + * assure the number of outstanding buffers (in unack queues) to be not very large. + * + * MIN_RTT/MAX_RTT/DEFAULT_RTT/MIN_EXPIRATION_PERIOD/MAX_EXPIRATION_PERIOD gives some heuristic values about + * the computation of RTT and expiration period. RTT and expiration period (RTO) are not + * constant for various kinds of hardware and workloads. Thus, they are computed dynamically. + * But we also want to bound the values of RTT and MAX_EXPIRATION_PERIOD. It is + * because there are some faults that may make RTT a very abnormal value. Thus, RTT and + * expiration period are upper and lower bounded. + * + * MAX_SEQS_IN_DISORDER_ACK should be smaller than (MIN_PACKET_SIZE - sizeof(icpkthdr))/sizeof(uint32). + * It is due to the limitation of the ack receive buffer size. + * + */ +#define UNACK_QUEUE_RING_SLOTS_NUM (2000) +#define TIMER_SPAN (session_param.Gp_interconnect_timer_period * 1000ULL) /* default: 5ms */ +#define TIMER_SPAN_LOSS (session_param.Gp_interconnect_timer_period * 500ULL) /* default: 5ms */ +#define TIMER_CHECKING_PERIOD (session_param.Gp_interconnect_timer_checking_period) /* default: 20ms */ +#define UNACK_QUEUE_RING_LENGTH (UNACK_QUEUE_RING_SLOTS_NUM * TIMER_SPAN) +#define UNACK_QUEUE_RING_LENGTH_LOSS (UNACK_QUEUE_RING_SLOTS_NUM * TIMER_SPAN_LOSS) + +#define DEFAULT_RTT (session_param.Gp_interconnect_default_rtt * 1000) /* default: 20ms */ +#define MIN_RTT (100) /* 0.1ms */ +#define MAX_RTT (200 * 1000) /* 200ms */ +#define RTT_SHIFT_COEFFICIENT (3) /* RTT_COEFFICIENT 1/8 (0.125) */ + +#define DEFAULT_DEV (0) +#define MIN_DEV MIN_RTT +#define MAX_DEV MAX_RTT +#define DEV_SHIFT_COEFFICIENT (2) /* DEV_COEFFICIENT 1/4 (0.25) */ + +#define MAX_EXPIRATION_PERIOD (1000 * 1000) /* 1s */ +#define MIN_EXPIRATION_PERIOD (session_param.Gp_interconnect_min_rto * 1000) /* default: 20ms */ + +#define MAX_TIME_NO_TIMER_CHECKING (50 * 1000) /* 50ms */ +#define DEADLOCK_CHECKING_TIME (512 * 1000) /* 512ms */ + +#define MAX_SEQS_IN_DISORDER_ACK (4) + +/* + * UnackQueueRing + * + * An unacked queue ring is used to decide which packet is expired in constant time. + * + * Each slot of the ring represents a fixed time span, for example 1ms, and + * each slot has a associated buffer list/queue which contains the packets + * which will expire in the time span. + * + * If the current time pointer (time t) points to slot 1, + * then slot 2 represents the time span from t + 1ms to t + 2ms. + * When we check whether there are some packets expired, we start from the last + * current time recorded, and resend all the packets in the queue + * until we reach the slot that the updated current time points to. 
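The constant-time expiration check described here reduces to mapping timestamps onto slot indices modulo the ring size and draining every slot between the previously recorded position and the slot that the current time falls into. The sketch below shows only that index walk, assuming a fixed 5 ms slot width; it collects buffers instead of resending them and ignores the congestion-control limits the real code applies.

```cpp
// Simplified sketch of the unack-queue-ring walk: slot-index arithmetic only,
// not the project's implementation.
#include <cstdint>
#include <list>
#include <vector>

constexpr int      kSlots      = 2000;      /* UNACK_QUEUE_RING_SLOTS_NUM */
constexpr uint64_t kSlotSpanUs = 5 * 1000;  /* TIMER_SPAN: 5 ms default   */

struct RingSketch
{
    uint64_t currentTime;                   /* time recorded at the last check   */
    int      idx = 0;                       /* slot corresponding to currentTime */
    std::vector<std::list<int>> slots = std::vector<std::list<int>>(kSlots);

    explicit RingSketch(uint64_t start) : currentTime(start) {}

    /* Advance slot by slot until reaching the slot `now` falls into,
     * taking everything queued in the slots passed over. */
    std::list<int> expire(uint64_t now)
    {
        std::list<int> expired;
        while (currentTime + kSlotSpanUs <= now)
        {
            expired.splice(expired.end(), slots[idx]);
            idx = (idx + 1) % kSlots;
            currentTime += kSlotSpanUs;
        }
        return expired;
    }
};
```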
+ * + */ +typedef struct UnackQueueRing UnackQueueRing; +struct UnackQueueRing +{ + /* save the current time when we check the time wheel for expiration */ + uint64 currentTime; + + /* the slot index corresponding to current time */ + int idx; + + /* the number of outstanding packets in unack queue ring */ + int numOutStanding; + + /* + * the number of outstanding packets that use the shared bandwidth in the + * congestion window. + */ + int numSharedOutStanding; + + /* time slots */ + ICBufferList slots[UNACK_QUEUE_RING_SLOTS_NUM]; +#ifdef TIMEOUT_Z + uint32_t retrans_count; + uint32_t no_retrans_count; + uint32_t time_difference; + uint32_t min; + uint32_t max; +#endif +}; + +/* + * AckSendParam + * + * The parameters for ack sending. + */ +typedef struct AckSendParam +{ + /* header for the ack */ + icpkthdr msg; + + /* peer address for the ack */ + struct sockaddr_storage peer; + socklen_t peer_len; +} AckSendParam; + +/* + * ICStatistics + * + * A structure keeping various statistics about interconnect internal. + * + * Note that the statistics for ic are not accurate for multiple cursor case on QD. + * + * totalRecvQueueSize - receive queue size sum when main thread is trying to get a packet. + * recvQueueSizeCountingTime - counting times when computing totalRecvQueueSize. + * totalCapacity - the capacity sum when packets are tried to be sent. + * capacityCountingTime - counting times used to compute totalCapacity. + * totalBuffers - total buffers available when sending packets. + * bufferCountingTime - counting times when compute totalBuffers. + * activeConnectionsNum - the number of active connections. + * retransmits - the number of packet retransmits. + * mismatchNum - the number of mismatched packets received. + * crcErrors - the number of crc errors. + * sndPktNum - the number of packets sent by sender. + * recvPktNum - the number of packets received by receiver. + * disorderedPktNum - disordered packet number. + * duplicatedPktNum - duplicate packet number. + * recvAckNum - the number of Acks received. + * statusQueryMsgNum - the number of status query messages sent. + * + */ +typedef struct ICStatistics +{ + uint64 totalRecvQueueSize; + uint64 recvQueueSizeCountingTime; + uint64 totalCapacity; + uint64 capacityCountingTime; + uint64 totalBuffers; + uint64 bufferCountingTime; + uint32 activeConnectionsNum; + int32 retransmits; + int32 startupCachedPktNum; + int32 mismatchNum; + int32 crcErrors; + int32 sndPktNum; + int32 recvPktNum; + int32 disorderedPktNum; + int32 duplicatedPktNum; + int32 recvAckNum; + int32 statusQueryMsgNum; +} ICStatistics; + +struct TransportEntry; + +struct udp_send_vars +{ + /* send sequence variables */ + uint32_t snd_una; /* send unacknoledged */ + uint32_t snd_wnd; /* send window (unscaled) */ + + /* retransmission timeout variables */ + uint8_t nrtx; /* number of retransmission */ + uint8_t max_nrtx; /* max number of retransmission */ + uint32_t rto; /* retransmission timeout */ + uint32_t ts_rto; /* timestamp for retransmission timeout */ + + /* congestion control variables */ + uint32_t cwnd; /* congestion window */ + uint32_t ssthresh; /* slow start threshold */ + + TAILQ_ENTRY(UDPConn) send_link; + TAILQ_ENTRY(UDPConn) timer_link; /* timer link (rto list) */ + +}; + +/* + * Structure used for keeping track of a pt-to-pt connection between two + * Cdb Entities (either QE or QD). 
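+ * On the send side it carries the sndQueue/unackQueue buffer lists and the
+ * sentSeq/receivedAckSeq/consumedSeq counters; it also keeps the bounded
+ * pkt_q packet queue (at most Gp_interconnect_queue_depth entries) and the
+ * RTT/RTO bookkeeping used for retransmission.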
+ */ +struct UDPConn : public MotionConn +{ +public: + /* send side queue for packets to be sent */ + ICBufferList sndQueue; + int capacity; + + /* seq already sent */ + uint32 sentSeq; + + /* ack of this seq and packets with smaller seqs have been received */ + uint32 receivedAckSeq; + + /* packets with this seq or smaller seqs have been consumed */ + uint32 consumedSeq; + + uint64 rtt; + uint64 dev; + uint64 deadlockCheckBeginTime; + + ICBuffer *curBuff; + + /* + * send side unacked packet queue. Since it is often accessed at the same + * time with unack queue ring, it is protected with unqck queue ring lock. + */ + ICBufferList unackQueue; + + uint16 route; + + struct icpkthdr conn_info; + + struct sockaddr_storage peer; /* Allow for IPv4 or IPv6 */ + socklen_t peer_len; /* And remember the actual length */ + + /* a queue of maximum length Gp_interconnect_queue_depth */ + uint32 pkt_q_capacity; /* max capacity of the queue */ + uint32 pkt_q_size; /* number of packets in the queue */ + int pkt_q_head; + int pkt_q_tail; + uint8 **pkt_q; + + uint64 stat_total_ack_time; + uint64 stat_count_acks; + uint64 stat_max_ack_time; + uint64 stat_min_ack_time; + uint64 stat_count_resent; + uint64 stat_max_resent; + uint64 stat_count_dropped; + + struct { + uint32_t ts_rto; + uint32_t rto; + uint32_t srtt; + uint32_t rttvar; + uint32_t snd_una; + uint16_t nrtx; + uint16_t max_nrtx; + uint32_t mss; + uint32_t cwnd; + uint32_t ssthresh; + uint32_t fss; + uint8_t loss_count; + uint32_t mdev; + uint32_t mdev_max; + uint32_t rtt_seq; /* sequence number to update rttvar */ + uint32_t ts_all_rto; + bool karn_mode; + } rttvar; + + uint8_t on_timewait_list; + int16_t on_rto_idx; + + uint32_t snd_nxt; /* send next */ + struct udp_send_vars sndvar; + + TransportEntry *entry_; + +public: + UDPConn(TransportEntry *entry); + + void GetDataInBuf(GetDataLenInPacket getLen, DataBlock *data); + void ReleaseBuffer(AckSendParam *param); + + void setAckParam(AckSendParam *param, int32 flags, uint32 seq, uint32 extraSeq); + void sendAck(int32 flags, uint32 seq, uint32 extraSeq); + void sendDisorderAck(uint32 seq, uint32 extraSeq, uint32 lostPktCnt); + void sendStatusQueryMessage(uint32 seq); + + uint64 computeExpirationPeriod(uint32 retry); + + void freeDisorderedPackets(); + void prepareRxConnForRead(); + void DeactiveConn(); + + void handleAckedPacket(ICBuffer *buf, uint64 now, struct icpkthdr *pkt); + void prepareXmit(); + void sendOnce(icpkthdr *pkt); + void handleStop(); + void sendBuffers(); + + void handleDisorderPacket(int pos, uint32 tailSeq, icpkthdr *pkt); + bool handleAckForDisorderPkt(icpkthdr *pkt); + bool handleAckForDuplicatePkt(icpkthdr *pkt); + int computeTimeout(int retry); + + void Send(DataBlock *data); + + void checkDeadlock(); + void checkExceptions(int retry, int timeout); + + void updateRetransmitStatistics(); + void checkExpirationCapacityFC(int timeout); + void checkExpiration(ICChunkTransportState *transportStates, uint64 now); + + static void checkNetworkTimeout(ICBuffer *buf, uint64 now, bool *networkTimeoutIsLogged); + + static void sendAckWithParam(AckSendParam *param); + static void sendControlMessage(icpkthdr *pkt, int fd, struct sockaddr *addr, socklen_t peerLen); +}; + + +/* + * Used to organize all of the information for a given motion node. + */ +struct CChunkTransportStateEntry +{ + int motNodeId; + bool valid; + + /* Connection array + * + * MUST pay attention: use getMotionConn to get MotionConn. + * must not use `->conns[index]` to get MotionConn. 
Because the struct + * MotionConn is a base structure for MotionConnTCP and + * MotionConnUDP. After connection setup, the `conns` will be fill + * with MotionConnUDP/MotionConnTCP, but the pointer still is + * MotionConn which should use `CONTAINER_OF` to get the real object. + */ + MotionConn *conns; + int numConns; + + int scanStart; + + /* slice table entries */ + struct ICExecSlice *sendSlice; + struct ICExecSlice *recvSlice; +}; + +class CChunkTransportStateImpl; + +class TransportEntry : public CChunkTransportStateEntry +{ +public: + static std::unique_ptr + MakeRecvEntry(CChunkTransportStateImpl *state, int icid, ICExecSlice *sendSlice, ICExecSlice *recvSlice); + + static std::unique_ptr + MakeSendEntry(CChunkTransportStateImpl *state, int icid, ICExecSlice *sendSlice, ICExecSlice *recvSlice); + + TransportEntry(CChunkTransportStateImpl *state, int motNodeID, int numConns, ICExecSlice *sendSlice, ICExecSlice *recvSlice); + + UDPConn* GetConn(int index); + + void aggregateStatistics(); + + bool handleAcks(bool need_flush); + void handleStopMsgs(); + + bool pollAcks(int timeout); + + void dumpConnections(const char *fname); + + bool SendData(int16 targetRoute, DataBlock *pblocks, int num, bool broadcast); + void Broadcast(DataBlock *data, int *inactiveCountPtr); + void Send(int16 targetRoute, DataBlock *db, bool broadcast, int *recount); + + void RecvAny(int16 *srcRoute, GetDataLenInPacket getLen, DataBlock *data); + void RecvRoute(int16 srcRoute, GetDataLenInPacket getLen, DataBlock *data); + void receiveChunksUDPIFC(int16 *srcRoute, UDPConn *conn, GetDataLenInPacket getLen, DataBlock *data); + +public: + /* setup info */ + int txfd; + int txfd_family; + unsigned short txport; + + bool sendingEos; + + /* Statistics info for this motion on the interconnect level */ + uint64 stat_total_ack_time; + uint64 stat_count_acks; + uint64 stat_max_ack_time; + uint64 stat_min_ack_time; + uint64 stat_count_resent; + uint64 stat_max_resent; + uint64 stat_count_dropped; + + std::vector> conns_; + CChunkTransportStateImpl *state; +}; + + +class CChunkTransportStateImpl : public CChunkTransportState +{ +public: + CChunkTransportStateImpl(ICSliceTable *sliceTable); + + static ICChunkTransportState* SetupUDP(ICSliceTable *sliceTable, SessionMotionLayerIPCParam *param); + void TeardownUDP(bool hasErrors); + + void RecvRoute(int16 motNodeID, int16 srcRoute, GetDataLenInPacket getLen, DataBlock *data); + void RecvAny(int16 motNodeID, int16 *srcRoute, GetDataLenInPacket getLen, DataBlock *data); + void SendStop(int16 motNodeID); + void ReleaseAndAck(int motNodeID, int route); + void DeactiveRoute(int motNodeID, int srcRoute, const char *reason); + + void SendEOS(int motNodeID, DataBlock *data); + bool SendData(int16 motNodeID, int16 targetRoute, DataBlock *pblocks, int num, bool broadcast); + void GetFreeSpace(int16 motNodeID, int16 targetRoute, BufferBlock *b); + void ReduceFreeSpace(int16 motNodeID, int16 targetRoute, int length); + + void* GetOpaqueDataInConn(int16 motNodeID, int16 targetRoute); + int32* GetSentRecordTypmodInConn(int16 motNodeID, int16 targetRoute); + + int GetConnNum(int motNodeID); + + TransportEntry* GetEntry(int motNodeID, bool checkValid); + + static CChunkTransportState **GetTransportState(); + + /* APIs for vector engine */ + void NotifyQuit(); + void SetVectorEngineAsUser(); + +private: + void checkMotNodeID(int sendMotNodeID); + void CreateRecvEntries(ICSliceTable *sliceTable); + void CreateSendEntries(ICSliceTable *sliceTable); + void DestroyRecvEntries(bool *isReceiver); + 
void DestroySendEntries(); + static ICChunkTransportState* setup(ICSliceTable *sliceTable); + void teardown(bool hasErrors); + + std::vector> entries_; + + static CChunkTransportState *state_; +}; + +} // namespace + +#endif /* IC_UDP2_INTERNAL_HPP */ \ No newline at end of file diff --git a/contrib/udp2/ic_modules.c b/contrib/udp2/ic_modules.c new file mode 100644 index 00000000000..47843f08c99 --- /dev/null +++ b/contrib/udp2/ic_modules.c @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * ic_modules.c + * + * IDENTIFICATION + * contrib/udp2/ic_modules.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "cdb/ml_ipc.h" +#include "ic_modules.h" +#include "ic_udp2.h" + +PG_MODULE_MAGIC; + +static +MotionIPCLayer udp2_ipc_layer = { + .ic_type = INTERCONNECT_TYPE_UDP2, + .type_name = "udp2", + + .GetMaxTupleChunkSize = GetMaxTupleChunkSizeUDP2, + .GetListenPort = GetListenPortUDP2, + + .InitMotionLayerIPC = InitMotionIPCLayerUDP2, + .CleanUpMotionLayerIPC = CleanUpMotionLayerIPCUDP2, + .WaitInterconnectQuit = WaitInterconnectQuitUDPIFC2, + .SetupInterconnect = SetupInterconnectUDP2, + .TeardownInterconnect = TeardownInterconnectUDP2, + + .SendTupleChunkToAMS = SendTupleChunkToAMSUDP2, + .SendChunk = NULL, + .SendEOS = SendEOSUDPIFC2, + .SendStopMessage = SendStopMessageUDPIFC2, + + .RecvTupleChunkFromAny = RecvTupleChunkFromAnyUDPIFC2, + .RecvTupleChunkFrom = RecvTupleChunkFromUDPIFC2, + .RecvTupleChunk = NULL, + + .DirectPutRxBuffer = MlPutRxBufferUDPIFC2, + + .DeregisterReadInterest = DeregisterReadInterestUDP2, + .GetActiveMotionConns = GetActiveMotionConnsUDPIFC2, + + .GetTransportDirectBuffer = GetTransportDirectBufferUDPIFC2, + .PutTransportDirectBuffer = PutTransportDirectBufferUDPIFC2, + +#ifdef ENABLE_IC_PROXY + .IcProxyServiceMain = ic_proxy_server_main, +#else + .IcProxyServiceMain = NULL, +#endif + + .GetMotionConnTupleRemapper = GetMotionConnTupleRemapperUDPIFC2, + .GetMotionSentRecordTypmod = GetMotionSentRecordTypmodUDPIFC2, +}; + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not load udp2 outside process shared preload"))); + } + + RegisterIPCLayerImpl(&udp2_ipc_layer); +} \ No newline at end of file diff --git a/contrib/udp2/ic_modules.h b/contrib/udp2/ic_modules.h new file mode 100644 index 00000000000..a05b65894f3 --- /dev/null +++ b/contrib/udp2/ic_modules.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * ic_modules.h + * + * IDENTIFICATION + * contrib/udp2/ic_modules.h + * + *------------------------------------------------------------------------- + */ +#ifndef INTER_CONNECT_H +#define INTER_CONNECT_H + +extern void _PG_init(void); + +#endif // INTER_CONNECT_H \ No newline at end of file diff --git a/contrib/udp2/ic_udp2.c b/contrib/udp2/ic_udp2.c new file mode 100644 index 00000000000..0972c874941 --- /dev/null +++ b/contrib/udp2/ic_udp2.c @@ -0,0 +1,1028 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * ic_udp2.c + * + * IDENTIFICATION + * contrib/udp2/ic_udp2.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "cdb/cdbdisp.h" +#include "cdb/cdbgang.h" +#include "cdb/cdbmotion.h" +#include "cdb/cdbvars.h" +#include "cdb/tupchunklist.h" +#include "postmaster/bgworker.h" +#include "postmaster/postmaster.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "tcop/tcopprot.h" +#include "utils/wait_event.h" +#include "utils/memutils.h" + +/* local interconnect */ +#include "ic_udp2.h" + +/* from ic_common packeage */ +#include "ic_types.h" +#include "udp2/ic_udp2.h" + +#define MAX_QUEUE_SIZE (64) + +#define HandleLastError() \ +do { \ + ICError *error = GetLastError(); \ + Assert(error); \ + if (error->level == LEVEL_ERROR) \ + { \ + Assert(error->msg); \ + elog(ERROR, "%s", error->msg); \ + } \ + if (error->level == LEVEL_FATAL) \ + { \ + Assert(error->msg); \ + elog(FATAL, "%s", error->msg); \ + } \ +} while (0) + +#define ML_CHECK_FOR_INTERRUPTS(teardownActive) \ + do {if (!teardownActive && InterruptPending) CHECK_FOR_INTERRUPTS(); } while (0) + +/* + * Resource manager + */ +typedef void (*TeardownInterconnectCallBack)(ChunkTransportState *transportStates, bool hasErrors); +typedef struct interconnect_handle_t +{ + ChunkTransportState *interconnect_context; /* Interconnect state */ + + // callback for interconnect been abort + TeardownInterconnectCallBack teardown_cb; + + ResourceOwner owner; /* owner of this handle */ + struct interconnect_handle_t *next; + struct interconnect_handle_t *prev; +} interconnect_handle_t; + +static interconnect_handle_t * open_interconnect_handles; +static bool interconnect_resowner_callback_registered; + +static void destroy_interconnect_handle(interconnect_handle_t *h); +static interconnect_handle_t * allocate_interconnect_handle(TeardownInterconnectCallBack callback); +static interconnect_handle_t * find_interconnect_handle(ChunkTransportState *icContext); + + +static void SetupGlobalMotionLayerIPCParam(GlobalMotionLayerIPCParam *param); +static void SetupSessionMotionLayerIPCParam(SessionMotionLayerIPCParam *param); +static bool CheckPostmasterIsAlive(void); +static void CheckCancelOnQD(ICChunkTransportState *pTransportStates); +static void CheckInterrupts(int teardownActive); +static void SimpleFaultInjector(const char *faultname); +static void *CreateOpaqueData(void); +static void DestroyOpaqueData(void **opaque); +static ICSliceTable* ConvertToICSliceTable(SliceTable *tbl); +static TupleChunkListItem ConvertToTupleChunk(ChunkTransportState *transportStates, DataBlock *data); +static ChunkTransportState *CreateChunkTransportState(EState *estate, ICChunkTransportState *udp2_state); + + +int +GetMaxTupleChunkSizeUDP2(void) +{ + int header_size = UDP2_GetICHeaderSizeUDP(); + return Gp_max_packet_size - header_size - TUPLE_CHUNK_HEADER_SIZE; +} + +int32 +GetListenPortUDP2(void) +{ + return UDP2_GetListenPortUDP(); +} + +void +InitMotionIPCLayerUDP2(void) +{ + GlobalMotionLayerIPCParam param; + SetupGlobalMotionLayerIPCParam(¶m); + + param.checkPostmasterIsAliveCallback = CheckPostmasterIsAlive; + param.checkInterruptsCallback = CheckInterrupts; + param.simpleFaultInjectorCallback = SimpleFaultInjector; + + param.createOpaqueDataCallback = CreateOpaqueData; + param.destroyOpaqueDataCallback = DestroyOpaqueData; + + param.checkCancelOnQDCallback = CheckCancelOnQD; + + ResetLastError(); + UDP2_InitUDPIFC(¶m); + HandleLastError(); +} + +void 
+CleanUpMotionLayerIPCUDP2(void) +{ + if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) + elog(DEBUG3, "Cleaning Up Motion Layer IPC..."); + + ResetLastError(); + UDP2_CleanUpUDPIFC(); + HandleLastError(); +} + +void +WaitInterconnectQuitUDPIFC2(void) +{ + ResetLastError(); + UDP2_WaitQuitUDPIFC(); + HandleLastError(); +} + +void +SetupInterconnectUDP2(EState *estate) +{ + int32 sliceNum = 0; + int32 calcQueueDepth = 0; + int32 calcSndDepth = 0; + + if (estate->interconnect_context) + elog(ERROR, "SetupInterconnectUDP: already initialized."); + + if (!estate->es_sliceTable) + elog(ERROR, "SetupInterconnectUDP: no slice table ?"); + + if (estate != NULL && estate->es_sliceTable != NULL) + sliceNum = estate->es_sliceTable->numSlices; + else + sliceNum = 1; + + if (Gp_interconnect_mem_size > 0 && + Gp_interconnect_queue_depth == 4 && + Gp_interconnect_snd_queue_depth == 2) + { + int32 perQueue = Gp_interconnect_mem_size / + (Gp_max_packet_size * sliceNum); + + calcSndDepth = Max(Gp_interconnect_snd_queue_depth, perQueue / 2); + calcQueueDepth = Max(Gp_interconnect_queue_depth, perQueue - calcSndDepth); + + if (calcSndDepth > MAX_QUEUE_SIZE) + calcSndDepth = MAX_QUEUE_SIZE; + + if (calcQueueDepth > MAX_QUEUE_SIZE) + calcQueueDepth = MAX_QUEUE_SIZE; + + Gp_interconnect_snd_queue_depth = calcSndDepth; + Gp_interconnect_queue_depth = calcQueueDepth; + + elog(DEBUG1, "SetupUDPIFCInterconnect: queue depth, " + "queue_depth=%d, snd_queue_depth=%d, " + "mem_size=%d, slices=%d, packet_size=%d", + Gp_interconnect_queue_depth, + Gp_interconnect_snd_queue_depth, + Gp_interconnect_mem_size, + sliceNum, + Gp_max_packet_size); + } + + SessionMotionLayerIPCParam param; + SetupSessionMotionLayerIPCParam(¶m); + + interconnect_handle_t *h; + h = allocate_interconnect_handle(TeardownInterconnectUDP2); + + ICSliceTable *tbl = ConvertToICSliceTable(estate->es_sliceTable); + + ResetLastError(); + ICChunkTransportState *udp2_state = UDP2_SetupUDP(tbl, ¶m); + HandleLastError(); + + Assert(udp2_state); + ChunkTransportState *state = CreateChunkTransportState(estate, udp2_state); + h->interconnect_context = state; + + h->interconnect_context->estate = estate; + estate->interconnect_context = h->interconnect_context; + estate->es_interconnect_is_setup = true; + + /* Check if any of the QEs has already finished with error */ + if (Gp_role == GP_ROLE_DISPATCH) + { + ChunkTransportState *pTransportStates = h->interconnect_context; + + Assert(pTransportStates); + Assert(pTransportStates->estate); + + if (cdbdisp_checkForCancel(pTransportStates->estate->dispatcherState)) + { + ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg(CDB_MOTION_LOST_CONTACT_STRING))); + /* not reached */ + } + } +} + +void +TeardownInterconnectUDP2(ChunkTransportState *transportStates, bool hasErrors) +{ + if (transportStates == NULL || transportStates->sliceTable == NULL) + { + elog(LOG, "TeardownUDPIFCInterconnect: missing slice table."); + return; + } + + /* TODO: should pass interconnect_handle_t as arg? 
*/ + interconnect_handle_t *h = find_interconnect_handle(transportStates); + + ResetLastError(); + HOLD_INTERRUPTS(); + + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_TeardownUDP(udp2_state, hasErrors); + + transportStates->activated = false; + transportStates->sliceTable = NULL; + + RESUME_INTERRUPTS(); + HandleLastError(); + + if (h != NULL) + destroy_interconnect_handle(h); +} + +bool +SendTupleChunkToAMSUDP2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + TupleChunkListItem tcItem) +{ + if (!transportStates) + { + elog(FATAL, "SendTupleChunkToAMS: no transport-states."); + } + + if (!transportStates->activated) + { + elog(FATAL, "SendTupleChunkToAMS: transport states inactive"); + } + + /* check em' */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG3, "sendtuplechunktoams: calling get_transport_state" + "w/transportStates %p transportState->size %d motnodeid %d route %d", + transportStates, transportStates->size, motNodeID, targetRoute); +#endif + + /* get the number of TupleChunkListItem */ + int num = 0; + TupleChunkListItem item = tcItem; + while (item) + { + num++; + item = item->p_next; + } + + /* convert to DataBlock */ + DataBlock *pblocks = (DataBlock *)palloc0(sizeof(DataBlock) * num); + item = tcItem; + for (int i = 0; i < num; ++i) + { + pblocks[i].pos = item->chunk_data; + pblocks[i].len = item->chunk_length; + + item = item->p_next; + } + + bool broadcast = (targetRoute == BROADCAST_SEGIDX); + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + bool rs = UDP2_SendData(udp2_state, + motNodeID, + targetRoute, + pblocks, + num, + broadcast); + HandleLastError(); + + return rs; +} + +void +SendEOSUDPIFC2(ChunkTransportState *transportStates, + int motNodeID, + TupleChunkListItem tcItem) +{ + if (!transportStates) + { + elog(FATAL, "SendEOSUDPIFC: missing interconnect context."); + } + else if (!transportStates->activated && !transportStates->teardownActive) + { + elog(FATAL, "SendEOSUDPIFC: context and teardown inactive."); + } + + /* check em' */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + + DataBlock db; + db.pos = tcItem->chunk_data; + db.len = tcItem->chunk_length; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_SendEOS(udp2_state, motNodeID, &db); + HandleLastError(); +} + +void +SendStopMessageUDPIFC2(ChunkTransportState *transportStates, int16 motNodeID) +{ + if (!transportStates->activated) + return; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_SendStop(udp2_state, motNodeID); + HandleLastError(); +} + +TupleChunkListItem +RecvTupleChunkFromAnyUDPIFC2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 *srcRoute) +{ + if (!transportStates) + { + elog(FATAL, "RecvTupleChunkFromAnyUDPIFC: missing context"); + } + else if (!transportStates->activated) + { + elog(FATAL, "RecvTupleChunkFromAnyUDPIFC: interconnect context not active!"); + } + + DataBlock db = {NULL, 0}; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_RecvAny(udp2_state, motNodeID, srcRoute, NULL, &db); + HandleLastError(); + + if (db.pos == NULL) + return NULL; + + return ConvertToTupleChunk(transportStates, &db); +} + 
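+/*
+ * Note: the receive wrappers above and below follow the same error-handling
+ * pattern as the rest of this file: ResetLastError() clears any stale ICError
+ * before calling into the UDP2_* core, and HandleLastError() re-raises a
+ * recorded failure through elog(ERROR) or elog(FATAL).  A NULL db.pos means
+ * the core returned no data; otherwise the packet payload is split into a
+ * chain of tuple chunks by ConvertToTupleChunk().
+ */
+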
+TupleChunkListItem +RecvTupleChunkFromUDPIFC2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 srcRoute) +{ + if (!transportStates) + { + elog(FATAL, "RecvTupleChunkFromUDPIFC: missing context"); + } + else if (!transportStates->activated) + { + elog(FATAL, "RecvTupleChunkFromUDPIFC: interconnect context not active!"); + } + +#ifdef AMS_VERBOSE_LOGGING + elog(LOG, "RecvTupleChunkFromUDPIFC()."); +#endif + + /* check em' */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG5, "RecvTupleChunkFromUDPIFC(motNodID=%d, srcRoute=%d)", motNodeID, srcRoute); +#endif + + DataBlock db = {NULL, 0}; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_RecvRoute(udp2_state, motNodeID, srcRoute, NULL, &db); + HandleLastError(); + + if (db.pos == NULL) + return NULL; + + return ConvertToTupleChunk(transportStates, &db); +} + +void +MlPutRxBufferUDPIFC2(ChunkTransportState *transportStates, int motNodeID, int route) +{ + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_ReleaseAndAck(udp2_state, motNodeID, route); + HandleLastError(); +} + +void +DeregisterReadInterestUDP2(ChunkTransportState *transportStates, + int motNodeID, + int srcRoute, + const char *reason) +{ + if (!transportStates) + { + elog(FATAL, "DeregisterReadInterestUDP: no transport states"); + } + + if (!transportStates->activated) + return; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_DeactiveRoute(udp2_state, motNodeID, srcRoute, reason); + HandleLastError(); +} + +uint32 +GetActiveMotionConnsUDPIFC2(void) +{ + return UDP2_GetActiveConns(); +} + +void +GetTransportDirectBufferUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute, + struct directTransportBuffer *b) +{ + if (!transportStates) + { + elog(FATAL, "GetTransportDirectBuffer: no transport states"); + } + else if (!transportStates->activated) + { + elog(FATAL, "GetTransportDirectBuffer: inactive transport states"); + } + else if (targetRoute == BROADCAST_SEGIDX) + { + elog(FATAL, "GetTransportDirectBuffer: can't direct-transport to broadcast"); + } + + BufferBlock buf = {NULL, 0}; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_GetFreeSpace(udp2_state, motNodeID, targetRoute, &buf); + HandleLastError(); + + b->pri = buf.pos; + b->prilen = buf.len; +} + +void +PutTransportDirectBufferUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute, + int length) +{ + if (!transportStates) + { + elog(FATAL, "PutTransportDirectBuffer: no transport states"); + } + else if (!transportStates->activated) + { + elog(FATAL, "PutTransportDirectBuffer: inactive transport states"); + } + else if (targetRoute == BROADCAST_SEGIDX) + { + elog(FATAL, "PutTransportDirectBuffer: can't direct-transport to broadcast"); + } + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + UDP2_ReduceFreeSpace(udp2_state, motNodeID, targetRoute, length); + HandleLastError(); +} + +TupleRemapper* +GetMotionConnTupleRemapperUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute) +{ + TupleRemapper *remapper = NULL; + + if (!transportStates) + { + elog(FATAL, "GetMotionConnTupleRemapper: no 
transport states"); + } + + if (!transportStates->activated) + { + elog(FATAL, "GetMotionConnTupleRemapper: inactive transport states"); + } + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + remapper = (TupleRemapper *)UDP2_GetOpaqueDataInConn(udp2_state, motNodeID, targetRoute); + HandleLastError(); + + return remapper; +} + +int32* +GetMotionSentRecordTypmodUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute) +{ + int32 *rs = NULL; + + if (!transportStates) + { + elog(FATAL, "GetMotionConnTupleRemapper: no transport states"); + } + + if (!transportStates->activated) + { + elog(FATAL, "GetMotionConnTupleRemapper: inactive transport states"); + } + + targetRoute = targetRoute == BROADCAST_SEGIDX ? 0 : targetRoute; + + ResetLastError(); + ICChunkTransportState *udp2_state = + (ICChunkTransportState *)transportStates->implement_state; + rs = UDP2_GetSentRecordTypmodInConn(udp2_state, motNodeID, targetRoute); + HandleLastError(); + + return rs; +} + +static void +SetupGlobalMotionLayerIPCParam(GlobalMotionLayerIPCParam *param) +{ + if (param == NULL) + return; + + param->interconnect_address = interconnect_address; + param->Gp_role = Gp_role; + param->ic_htab_size = ic_htab_size; + param->segment_number = getgpsegmentCount(); + param->MyProcPid = MyProcPid; + param->dbid = GpIdentity.dbid; + param->segindex = GpIdentity.segindex; + param->MyProcPort = MyProcPort != NULL; + param->myprocport_sock = param->MyProcPort ? MyProcPort->sock : -1; + param->Gp_max_packet_size = Gp_max_packet_size; + param->Gp_udp_bufsize_k = Gp_udp_bufsize_k; + param->Gp_interconnect_address_type = Gp_interconnect_address_type; +} + +static bool +CheckPostmasterIsAlive(void) +{ + return PostmasterIsAlive(); +} + +static void +CheckCancelOnQD(ICChunkTransportState *state) +{ + int nevent = 0, nrevent = 0; + int *waitFds = NULL; + WaitEvent *rEvents = NULL; + WaitEventSet *waitset = NULL; + ChunkTransportState *pTransportStates = NULL; + + if (Gp_role != GP_ROLE_DISPATCH) + return; + + pTransportStates = (ChunkTransportState *)state->clientState; + + /* get all wait sock fds */ + waitFds = cdbdisp_getWaitSocketFds(pTransportStates->estate->dispatcherState, &nevent); + if (waitFds == NULL) + return; + + /* init WaitEventSet */ + waitset = CreateWaitEventSet(CurrentMemoryContext, nevent); + rEvents = palloc(nevent * sizeof(WaitEvent)); /* returned events */ + for (int i = 0; i < nevent; ++i) + AddWaitEventToSet(waitset, WL_SOCKET_READABLE, waitFds[i], NULL, NULL); + + /* wait for event from QE */ + nrevent = WaitEventSetWait(waitset, 0, rEvents, nevent, WAIT_EVENT_INTERCONNECT); + + /* check to see if the dispatcher should cancel */ + for (int i = 0; i < nrevent; i++) + { + if (rEvents[i].events & WL_SOCKET_READABLE) + { + /* event happened on wait fds, need to check cancel */ + Assert(pTransportStates); + Assert(pTransportStates->estate); + + if (cdbdisp_checkForCancel(pTransportStates->estate->dispatcherState)) + { + ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg(CDB_MOTION_LOST_CONTACT_STRING))); + /* not reached */ + } + break; + } + } + + if (waitset) + FreeWaitEventSet((WaitEventSet *)waitset); + if (rEvents) + pfree(rEvents); +} + +static void +CheckInterrupts(int teardownActive) +{ + ML_CHECK_FOR_INTERRUPTS(teardownActive); +} + +static void +SimpleFaultInjector(const char *faultname) +{ + SIMPLE_FAULT_INJECTOR((faultname)); +} + +static void * +CreateOpaqueData(void) +{ + return 
CreateTupleRemapper(); +} + +static void +DestroyOpaqueData(void **opaque) +{ + if (*opaque == NULL) + return; + *opaque = NULL; +} + +static void +SetupSessionMotionLayerIPCParam(SessionMotionLayerIPCParam *param) +{ + if (param == NULL) + return; + + TransactionId localTransId = 0; + TransactionId subtransId = 0; + + param->Gp_interconnect_queue_depth = Gp_interconnect_queue_depth; + param->Gp_interconnect_snd_queue_depth = Gp_interconnect_snd_queue_depth; + param->Gp_interconnect_timer_period = Gp_interconnect_timer_period; + param->Gp_interconnect_timer_checking_period = Gp_interconnect_timer_checking_period; + param->Gp_interconnect_default_rtt = Gp_interconnect_default_rtt; + param->Gp_interconnect_min_rto = Gp_interconnect_min_rto; + param->Gp_interconnect_transmit_timeout = Gp_interconnect_transmit_timeout; + param->Gp_interconnect_min_retries_before_timeout = Gp_interconnect_min_retries_before_timeout; + param->Gp_interconnect_debug_retry_interval = Gp_interconnect_debug_retry_interval; + param->gp_interconnect_full_crc = gp_interconnect_full_crc; + param->gp_interconnect_aggressive_retry = gp_interconnect_aggressive_retry; + param->gp_interconnect_cache_future_packets = gp_interconnect_cache_future_packets; + param->gp_interconnect_log_stats = gp_interconnect_log_stats; + param->interconnect_setup_timeout = interconnect_setup_timeout; + param->gp_log_interconnect = gp_log_interconnect; + param->gp_session_id = gp_session_id; + param->Gp_interconnect_fc_method = Gp_interconnect_fc_method; + param->gp_command_count = gp_command_count; + param->gp_interconnect_id = gp_interconnect_id; + param->log_min_messages = log_min_messages; + GetAllTransactionXids(¶m->distTransId, &localTransId, &subtransId); + +#ifdef USE_ASSERT_CHECKING + param->gp_udpic_dropseg = gp_udpic_dropseg; + param->gp_udpic_dropacks_percent = gp_udpic_dropacks_percent; + param->gp_udpic_dropxmit_percent = gp_udpic_dropxmit_percent; + param->gp_udpic_fault_inject_percent = gp_udpic_fault_inject_percent; + param->gp_udpic_fault_inject_bitmap = gp_udpic_fault_inject_bitmap; + param->gp_udpic_network_disable_ipv6 = gp_udpic_network_disable_ipv6; +#endif +} + +ICSliceTable* +ConvertToICSliceTable(SliceTable *tbl) +{ + ICSliceTable *ic_tbl = (ICSliceTable *)malloc(sizeof(ICSliceTable)); + memset(ic_tbl, 0, sizeof(ICSliceTable)); + + ic_tbl->localSlice = tbl->localSlice; + ic_tbl->ic_instance_id = tbl->ic_instance_id; + + ic_tbl->numSlices = tbl->numSlices; + ic_tbl->slices = (ICExecSlice *)malloc(sizeof(ICExecSlice) * ic_tbl->numSlices); + memset(ic_tbl->slices, 0, sizeof(ICExecSlice) * ic_tbl->numSlices); + + for (int i = 0; i < ic_tbl->numSlices; ++i) + { + ExecSlice *slice = tbl->slices + i; + ICExecSlice *ic_slice = ic_tbl->slices + i; + + ic_slice->sliceIndex = slice->sliceIndex; + ic_slice->parentIndex= slice->parentIndex; + ic_slice->numSegments = list_length(slice->segments); + + ic_slice->numChildren = list_length(slice->children); + ic_slice->children = malloc(sizeof(int) * ic_slice->numChildren); + memset(ic_slice->children, 0, sizeof(int) * ic_slice->numChildren); + + for (int i = 0; i < ic_slice->numChildren; ++i) + ic_slice->children[i] = list_nth_int(slice->children, i); + + ic_slice->numPrimaryProcesses = list_length(slice->primaryProcesses); + ic_slice->primaryProcesses = malloc(sizeof(ICCdbProcess) * ic_slice->numPrimaryProcesses); + memset(ic_slice->primaryProcesses, 0, sizeof(ICCdbProcess) * ic_slice->numPrimaryProcesses); + + for (int i = 0; i < ic_slice->numPrimaryProcesses; ++i) + { + CdbProcess 
*process = (CdbProcess *)list_nth(slice->primaryProcesses, i); + if (!process) + continue; + + ICCdbProcess *ic_process = ic_slice->primaryProcesses + i; + + ic_process->valid = true; + ic_process->listenerAddr = process->listenerAddr; + ic_process->listenerPort = process->listenerPort; + ic_process->pid = process->pid; + ic_process->contentid = process->contentid; + ic_process->dbid = process->dbid; + } + } + + return ic_tbl; +} + +/* + * msg MUST BE conn->msgPos, msg_size should be conn->msgSize - sizeof(icpkthdr) + * +----------------+-----------+--------------+------------+---+--------------+------------+ + * | tcp/udp header | ic header | chunk header | chunk data |...| chunk header | chunk data | + * +----------------+-----------+--------------+------------+---+--------------+------------+ + * |<-----#1 tuple chunk ----->|...|<-----#n tuple chunk ----->| + * |<------------------------- Gp_max_packet_size ------------------------>| + * + * |<------------------------ msg_size ----------------------->| + * msg + */ +TupleChunkListItem +ConvertToTupleChunk(ChunkTransportState *transportStates, DataBlock *data) +{ + TupleChunkListItem tcItem; + TupleChunkListItem firstTcItem = NULL; + TupleChunkListItem lastTcItem = NULL; + + uint32 tcSize; + int bytesProcessed = 0; + + while (bytesProcessed != data->len) + { + if (data->len - bytesProcessed < TUPLE_CHUNK_HEADER_SIZE) + { + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message: insufficient data received"), + errdetail("conn->msgSize %d bytesProcessed %d < chunk-header %d", + data->len, bytesProcessed, TUPLE_CHUNK_HEADER_SIZE))); + } + tcSize = TUPLE_CHUNK_HEADER_SIZE + (*(uint16 *) (data->pos + bytesProcessed)); + + /* sanity check */ + if (tcSize > Gp_max_packet_size) + { + /* + * see MPP-720: it is possible that our message got messed up by a + * cancellation ? + */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + /* + * MPP-4010: add some extra debugging. 
+ */ + if (lastTcItem != NULL) + elog(LOG, "Interconnect error parsing message: last item length %d inplace %p", lastTcItem->chunk_length, lastTcItem->inplace); + else + elog(LOG, "Interconnect error parsing message: no last item"); + + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message"), + errdetail("tcSize %d > max %d header %d processed %d/%d from %p", + tcSize, Gp_max_packet_size, + TUPLE_CHUNK_HEADER_SIZE, bytesProcessed, + data->len, data->pos))); + } + + Assert(tcSize <= data->len); + + /* + * We store the data inplace, and handle any necessary copying later + * on + */ + tcItem = (TupleChunkListItem) palloc(sizeof(TupleChunkListItemData)); + tcItem->p_next = NULL; + tcItem->chunk_length = tcSize; + tcItem->inplace = (char *) (data->pos + bytesProcessed); + + bytesProcessed += tcSize; + if (firstTcItem == NULL) + { + firstTcItem = tcItem; + lastTcItem = tcItem; + } + else + { + lastTcItem->p_next = tcItem; + lastTcItem = tcItem; + } + } + + return firstTcItem; +} + +static ChunkTransportState * +CreateChunkTransportState(EState *estate, ICChunkTransportState *udp2_state) +{ + MemoryContext oldContext; + ChunkTransportState *state; + + /* init ChunkTransportState */ + Assert(InterconnectContext != NULL); + oldContext = MemoryContextSwitchTo(InterconnectContext); + state = (ChunkTransportState *)palloc0(sizeof(ChunkTransportState)); + MemoryContextSwitchTo(oldContext); + + state->size = 0; + state->states = NULL; + state->activated = udp2_state->activated; + state->teardownActive = udp2_state->teardownActive; + state->aggressiveRetry = false; + state->incompleteConns = NIL; + state->sliceTable = estate->es_sliceTable; + state->sliceId = estate->es_sliceTable->localSlice; + state->estate = estate; + state->proxyContext = NULL; + + state->networkTimeoutIsLogged = false; + + /* save the reference each other */ + state->implement_state = udp2_state; + udp2_state->clientState = state; + + return state; +} + +/* + * must offer an empty proxy fucntion if ic-proxy is enabled(--enable-ic-proxy). + */ +int +ic_proxy_server_main(void) +{ + /* Establish signal handlers. 
*/ + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* dry run */ + while (true) + { + pg_usleep(1000000); + CHECK_FOR_INTERRUPTS(); + } + + return 0; +} + +/* + * Fucntions for Resource manager + */ +static void +destroy_interconnect_handle(interconnect_handle_t * h) +{ + h->interconnect_context = NULL; + /* unlink from linked list first */ + if (h->prev) + h->prev->next = h->next; + else + open_interconnect_handles = h->next; + if (h->next) + h->next->prev = h->prev; + + pfree(h); + + if (open_interconnect_handles == NULL) + MemoryContextReset(InterconnectContext); +} + +static void +cleanup_interconnect_handle(interconnect_handle_t * h) +{ + if (h->interconnect_context == NULL) + { + destroy_interconnect_handle(h); + return; + } + h->teardown_cb(h->interconnect_context, true); +} + +static void +interconnect_abort_callback(ResourceReleasePhase phase, + bool isCommit, + bool isTopLevel, + void *arg) +{ + interconnect_handle_t *curr; + interconnect_handle_t *next; + + if (phase != RESOURCE_RELEASE_AFTER_LOCKS) + return; + + next = open_interconnect_handles; + while (next) + { + curr = next; + next = curr->next; + + if (curr->owner == CurrentResourceOwner) + { + if (isCommit) + elog(WARNING, "interconnect reference leak: %p still referenced", curr); + + cleanup_interconnect_handle(curr); + } + } +} + +static interconnect_handle_t * +allocate_interconnect_handle(TeardownInterconnectCallBack callback) +{ + interconnect_handle_t *h; + + if (InterconnectContext == NULL) + InterconnectContext = AllocSetContextCreate(TopMemoryContext, + "Interconnect Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + h = MemoryContextAllocZero(InterconnectContext, sizeof(interconnect_handle_t)); + + h->teardown_cb = callback; + h->owner = CurrentResourceOwner; + h->next = open_interconnect_handles; + h->prev = NULL; + if (open_interconnect_handles) + open_interconnect_handles->prev = h; + open_interconnect_handles = h; + + if (!interconnect_resowner_callback_registered) + { + RegisterResourceReleaseCallback(interconnect_abort_callback, NULL); + interconnect_resowner_callback_registered = true; + } + return h; +} + +static interconnect_handle_t * +find_interconnect_handle(ChunkTransportState * icContext) +{ + interconnect_handle_t *head = open_interconnect_handles; + + while (head != NULL) + { + if (head->interconnect_context == icContext) + return head; + head = head->next; + } + return NULL; +} \ No newline at end of file diff --git a/contrib/udp2/ic_udp2.h b/contrib/udp2/ic_udp2.h new file mode 100644 index 00000000000..c351d2d0224 --- /dev/null +++ b/contrib/udp2/ic_udp2.h @@ -0,0 +1,87 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + * + * ic_udp2.h + * + * IDENTIFICATION + * contrib/udp2/ic_udp2.h + * + *------------------------------------------------------------------------- + */ +#ifndef IC_UDP_H +#define IC_UDP_H + +#include "cdb/cdbinterconnect.h" +#include "nodes/execnodes.h" /* EState, ExecSlice, SliceTable */ + +extern int GetMaxTupleChunkSizeUDP2(void); +extern int32 GetListenPortUDP2(void); + +extern void InitMotionIPCLayerUDP2(void); +extern void CleanUpMotionLayerIPCUDP2(void); + +extern void WaitInterconnectQuitUDPIFC2(void); + +extern void SetupInterconnectUDP2(EState *estate); +extern void TeardownInterconnectUDP2(ChunkTransportState * transportStates, bool hasErrors); + +extern bool SendTupleChunkToAMSUDP2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + TupleChunkListItem tcItem); +extern void SendEOSUDPIFC2(ChunkTransportState * transportStates, + int motNodeID, TupleChunkListItem tcItem); +extern void SendStopMessageUDPIFC2(ChunkTransportState * transportStates, int16 motNodeID); + +extern TupleChunkListItem RecvTupleChunkFromAnyUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 *srcRoute); +extern TupleChunkListItem RecvTupleChunkFromUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 srcRoute); + +void MlPutRxBufferUDPIFC2(ChunkTransportState * transportStates, int motNodeID, int route); + +extern void DeregisterReadInterestUDP2(ChunkTransportState * transportStates, + int motNodeID, + int srcRoute, + const char *reason); + +extern uint32 GetActiveMotionConnsUDPIFC2(void); + +extern void GetTransportDirectBufferUDPIFC2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + struct directTransportBuffer *b); +extern void PutTransportDirectBufferUDPIFC2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + int length); + +extern TupleRemapper* GetMotionConnTupleRemapperUDPIFC2(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute); + +extern int32* GetMotionSentRecordTypmodUDPIFC2(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute); + +extern int ic_proxy_server_main(void); + +#endif // IC_UDP_H \ No newline at end of file diff --git a/deploy/README.md b/deploy/README.md deleted file mode 100644 index b8da77a0521..00000000000 --- a/deploy/README.md +++ /dev/null @@ -1,21 +0,0 @@ - - -> [!WARNING] -> The files are still in progress and will be revised in the following months. diff --git a/devops/README.md b/devops/README.md new file mode 100644 index 00000000000..df440a2cc39 --- /dev/null +++ b/devops/README.md @@ -0,0 +1,108 @@ + + +# Auto-Build Apache Cloudberry from Source Code + +You can build Apache Cloudberry from source code in two ways: manually or automatically. + +For the manual build, you need to manually set up many system configurations and download third-party dependencies, which is quite cumbersome and error-prone. + +To make the job easier, it is recommended that you use the automated deployment method and scripts provided here. The automation method simplifies the deployment process, reduces time costs, and allows developers to focus more on business code development. + +## 1. Setup Docker environment + +Nothing special, just follow the [official documentation](https://docs.docker.com/engine/install/) to install Docker on your machine based on your OS. + +## 2. 
Create Docker build image + +Go to the supported OS directory, for example Rocky Linux 8: + +```bash +cd devops/deploy/docker/build/rocky8/ +``` + +And build image: + +```bash +docker build -t apache-cloudberry-env . +``` + +The whole process usually takes about 5 minutes. You can use the created base image as many times as you want, just launch a new container for your specific task. + +## 3. Launch container + +Launch the container in detached mode with a long-running process: + +```bash +docker run -h cdw -d --name cloudberry-build apache-cloudberry-env bash -c "/tmp/init_system.sh && tail -f /dev/null" +``` + +> [!NOTE] +> The container will be named `cloudberry-build` and run in the background for easy reference in subsequent commands. +> If you need to: +> - access the container interactively, use `docker exec -it cloudberry-build bash` +> - check if the container is running, use `docker ps` + +## 4. Checkout git repo inside container + +The same way you did it on your laptop + +```bash +docker exec cloudberry-build bash -c "cd /home/gpadmin && git clone --recurse-submodules --branch main --depth 1 https://github.com/apache/cloudberry.git" +``` + +## 5. Set environment and configure build container + +Create direcory for store logs: + +```bash +SRC_DIR=/home/gpadmin/cloudberry && docker exec cloudberry-build bash -c "mkdir ${SRC_DIR}/build-logs" +``` + +Execute configure and check if system is ready for build: + +```bash +SRC_DIR=/home/gpadmin/cloudberry && docker exec cloudberry-build bash -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ./devops/build/automation/cloudberry/scripts/configure-cloudberry.sh" +``` + +## 6. Build and install binary + +The building consumes all available CPU resources and can take minutes to complete: + +```bash +SRC_DIR=/home/gpadmin/cloudberry && docker exec cloudberry-build bash -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ./devops/build/automation/cloudberry/scripts/build-cloudberry.sh" +``` + +## 7. Install binary and create demo cluster + +The build script above has already installed the binaries to `/usr/local/cloudberry-db` inside the container. Now create the demo cluster just launch `create-cloudberry-demo-cluster.sh` + +```bash +SRC_DIR=/home/gpadmin/cloudberry && docker exec cloudberry-build bash -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ./devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh" +``` + +## 8. Execute test query + +Now you could set environment and execute queries: + +```bash +docker exec cloudberry-build bash -c "source /usr/local/cloudberry-db/cloudberry-env.sh && source /home/gpadmin/cloudberry/gpAux/gpdemo/gpdemo-env.sh && psql -U gpadmin -d postgres -c 'SELECT 42'" +``` + +All done! diff --git a/devops/build/automation/cloudberry/scripts/analyze_core_dumps.sh b/devops/build/automation/cloudberry/scripts/analyze_core_dumps.sh new file mode 100755 index 00000000000..4749ec76271 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/analyze_core_dumps.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: analyze_core_dumps.sh +# Description: Automated analysis tool for core dump files using GDB +# +# This script automatically analyzes core dump files found in a +# specified directory, providing stack traces and register +# information. It's particularly useful for analyzing crashes in +# Postgres/Cloudberry executables and Python applications. +# +# Features: +# - Automatic detection of core dump files +# - Support for both compiled executables and interpreted languages +# - Detailed stack traces with GDB +# - Register state analysis +# - Assembly code context at crash point +# - Comprehensive logging of analysis results +# +# Usage: analyze_core_dumps.sh [test_id] +# test_id: Optional identifier for the test configuration that generated cores +# +# Dependencies: +# - GDB (GNU Debugger) +# - file command +# +# Environment Variables: +# SRC_DIR - Base directory for operations (defaults to /tmp) +# +# Return Codes: +# 0 - No core files were found +# 1 - Core files were found and all were processed successfully +# 2 - Error conditions: +# - Missing required dependencies (gdb, file) +# - Issues processing some or all core files +# -------------------------------------------------------------------- + +set -u + +# Configuration +#----------------------------------------------------------------------------- +# Use SRC_DIR if defined, otherwise default to /tmp +SRC_DIR="${SRC_DIR:-/tmp}" +# Define log directory and files +LOG_DIR="${SRC_DIR}/build-logs" +# Create log directories if they don't exist +mkdir -p "${LOG_DIR}" + +# Determine log file name based on test_id argument +if [ $# -ge 1 ]; then + test_id="$1" + log_file="${LOG_DIR}/core_analysis_${test_id}_$(date +%Y%m%d_%H%M%S).log" +else + log_file="${LOG_DIR}/core_analysis_$(date +%Y%m%d_%H%M%S).log" +fi +echo "log_file: ${log_file}" + +# Directory where core dumps are located +core_dir="/tmp/cloudberry-cores/" + +# Pattern to match core dump files +core_pattern="core-*" + +# Function Definitions +#----------------------------------------------------------------------------- +# Log messages to both console and log file +# Args: +# $1 - Message to log +log_message() { + local message="[$(date '+%Y-%m-%d %H:%M:%S')] $1" + echo "$message" + echo "$message" >> "$log_file" +} + +# Analyze a single core file +# Args: +# $1 - Path to core file +# Returns: +# 0 on success, 1 on failure +analyze_core_file() { + local core_file="$1" + local file_info + + log_message "Analyzing core file: $core_file" + file_info=$(file "$core_file") + log_message "Core file info: $file_info" + + # Extract the original command from the core file info + if [[ "$file_info" =~ "from '([^']+)'" ]]; then + local original_cmd="${BASH_REMATCH[1]}" + log_message "Original command: $original_cmd" + fi + + # Extract executable path from core file info + if [[ "$file_info" =~ execfn:\ \'([^\']+)\' ]]; then + local executable="${BASH_REMATCH[1]}" + log_message "Executable path: $executable" + + # Convert relative path to absolute if needed + if [[ "$executable" == "./"* ]]; then + 
executable="$PWD/${executable:2}" + log_message "Converted to absolute path: $executable" + fi + + # Run GDB analysis + log_message "Starting GDB analysis..." + + gdb -quiet \ + --batch \ + -ex 'set pagination off' \ + -ex 'info target' \ + -ex 'thread apply all bt' \ + -ex 'print $_siginfo' \ + -ex quit \ + "$executable" "$core_file" 2>&1 >> "$log_file" + + local gdb_rc=$? + if [ $gdb_rc -eq 0 ] && [ -s "$log_file" ]; then + log_message "GDB analysis completed successfully" + return 0 + else + log_message "Warning: GDB analysis failed or produced no output" + return 1 + fi + else + log_message "Could not find executable path in core file" + return 1 + fi +} + +# Function to check required commands +check_dependencies() { + local missing=0 + local required_commands=("gdb" "file") + + log_message "Checking required commands..." + for cmd in "${required_commands[@]}"; do + if ! command -v "$cmd" >/dev/null 2>&1; then + log_message "Error: Required command '$cmd' not found" + missing=1 + fi + done + + if [ $missing -eq 1 ]; then + log_message "Missing required dependencies. Please install them and try again." + return 1 + fi + + log_message "All required commands found" + return 0 +} + +# Main Execution +#----------------------------------------------------------------------------- +main() { + local core_count=0 + local analyzed_count=0 + local return_code=0 + + log_message "Starting core dump analysis" + log_message "Using source directory: $SRC_DIR" + log_message "Using log directory: $LOG_DIR" + + # Check dependencies first + if ! check_dependencies; then + return 2 + fi + + # Process all core files + for core_file in "$core_dir"/$core_pattern; do + if [[ -f "$core_file" ]]; then + ((core_count++)) + if analyze_core_file "$core_file"; then + ((analyzed_count++)) + fi + fi + done + + # Determine return code based on results + if ((core_count == 0)); then + log_message "No core files found matching pattern $core_pattern in $core_dir" + return_code=0 # No cores found + elif ((analyzed_count == core_count)); then + log_message "Analysis complete. Successfully processed $analyzed_count core(s) files" + return_code=1 # All cores processed successfully + else + log_message "Analysis complete with errors. Processed $analyzed_count of $core_count core files" + return_code=2 # Some cores failed to process + fi + + log_message "Log file: $log_file" + + return $return_code +} + +# Script entry point +main +return_code=$? + +if ((return_code == 0)); then + rm -fv "${log_file}" +fi + +exit $return_code diff --git a/devops/build/automation/cloudberry/scripts/build-cloudberry.sh b/devops/build/automation/cloudberry/scripts/build-cloudberry.sh new file mode 100755 index 00000000000..efa061a0f83 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/build-cloudberry.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: build-cloudberry.sh +# Description: Builds Apache Cloudberry from source code and installs +# it. +# Performs the following steps: +# 1. Builds main Apache Cloudberry database components +# 2. Builds contrib modules +# 3. Installs both main and contrib components +# Uses parallel compilation based on available CPU cores. +# +# Required Environment Variables: +# SRC_DIR - Root source directory containing Apache Cloudberry +# source code +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to ${SRC_DIR}/build-logs) +# NPROC - Number of parallel jobs (defaults to all available cores) +# +# Usage: +# Export required variables: +# export SRC_DIR=/path/to/cloudberry/source +# Then run: +# ./build-cloudberry.sh +# +# Prerequisites: +# - configure-cloudberry.sh must be run first +# - Required build dependencies must be installed +# - ${BUILD_DESTINATION}/lib (by default /usr/local/cloudberry-db/lib) must exist and be writable +# +# Exit Codes: +# 0 - Build and installation completed successfully +# 1 - Environment setup failed (missing SRC_DIR, LOG_DIR creation failed) +# 2 - Main component build failed +# 3 - Contrib build failed +# 4 - Installation failed +# +# -------------------------------------------------------------------- + +set -euo pipefail + +# Source common utilities +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}/cloudberry-utils.sh" + +# Define log directory and files +export LOG_DIR="${SRC_DIR}/build-logs" +BUILD_LOG="${LOG_DIR}/build.log" + +# Initialize environment +init_environment "Cloudberry Build Script" "${BUILD_LOG}" + +# Set environment +log_section "Environment Setup" +export LD_LIBRARY_PATH=${BUILD_DESTINATION}/lib:LD_LIBRARY_PATH +log_section_end "Environment Setup" + +# Build process +log_section "Build Process" +execute_cmd make -j$(nproc) --directory ${SRC_DIR} || exit 2 +execute_cmd make -j$(nproc) --directory ${SRC_DIR}/contrib || exit 3 +log_section_end "Build Process" + +# Installation +log_section "Installation" +execute_cmd make install --directory ${SRC_DIR} || exit 4 +execute_cmd make install --directory ${SRC_DIR}/contrib || exit 4 +log_section_end "Installation" + +# Log completion +log_completion "Cloudberry Build Script" "${BUILD_LOG}" +exit 0 diff --git a/devops/build/automation/cloudberry/scripts/cloudberry-utils.sh b/devops/build/automation/cloudberry/scripts/cloudberry-utils.sh new file mode 100755 index 00000000000..c8d0f8cc44e --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/cloudberry-utils.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Library: cloudberry-utils.sh +# Description: Common utility functions for Apache Cloudberry build +# and test scripts +# +# Required Environment Variables: +# SRC_DIR - Root source directory +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to ${SRC_DIR}/build-logs) +# +# Functions: +# init_environment "Script Name" "Log File" +# - Initialize logging and verify environment +# - Parameters: +# * script_name: Name of the calling script +# * log_file: Path to log file +# - Returns: 0 on success, 1 on failure +# +# execute_cmd command [args...] +# - Execute command with logging +# - Parameters: Command and its arguments +# - Returns: Command's exit code +# +# run_psql_cmd "sql_command" +# - Execute PostgreSQL command with logging +# - Parameters: SQL command string +# - Returns: psql command's exit code +# +# source_cloudberry_env +# - Source Cloudberry environment files +# - Returns: 0 on success +# +# log_section "section_name" +# - Log section start +# - Parameters: Name of the section +# +# log_section_end "section_name" +# - Log section end +# - Parameters: Name of the section +# +# log_completion "script_name" "log_file" +# - Log script completion +# - Parameters: +# * script_name: Name of the calling script +# * log_file: Path to log file +# +# Usage: +# source ./cloudberry-utils.sh +# +# Example: +# source ./cloudberry-utils.sh +# init_environment "My Script" "${LOG_FILE}" +# execute_cmd make clean +# log_section "Build Process" +# execute_cmd make -j$(nproc) +# log_section_end "Build Process" +# log_completion "My Script" "${LOG_FILE}" +# +# -------------------------------------------------------------------- + +DEFAULT_BUILD_DESTINATION=/usr/local/cloudberry-db + +# Initialize logging and environment +init_environment() { + local script_name=$1 + local log_file=$2 + + init_build_destination_var + + echo "=== Initializing environment for ${script_name} ===" + echo "${script_name} executed at $(date)" | tee -a "${log_file}" + echo "Whoami: $(whoami)" | tee -a "${log_file}" + echo "Hostname: $(hostname)" | tee -a "${log_file}" + echo "Working directory: $(pwd)" | tee -a "${log_file}" + echo "Source directory: ${SRC_DIR}" | tee -a "${log_file}" + echo "Log directory: ${LOG_DIR}" | tee -a "${log_file}" + echo "Build destination: ${BUILD_DESTINATION}" | tee -a "${log_file}" + + if [ -z "${SRC_DIR:-}" ]; then + echo "Error: SRC_DIR environment variable is not set" | tee -a "${log_file}" + exit 1 + fi + + mkdir -p "${LOG_DIR}" +} + +# Function to echo and execute command with logging +execute_cmd() { + local cmd_str="$*" + local timestamp=$(date "+%Y.%m.%d-%H.%M.%S") + echo "Executing at ${timestamp}: $cmd_str" | tee -a "${LOG_DIR}/commands.log" + "$@" 2>&1 | tee -a "${LOG_DIR}/commands.log" + return ${PIPESTATUS[0]} +} + +# Function to run psql commands with logging +run_psql_cmd() { + local cmd=$1 + local timestamp=$(date "+%Y.%m.%d-%H.%M.%S") + echo "Executing psql at ${timestamp}: $cmd" | tee -a "${LOG_DIR}/psql-commands.log" + psql -P pager=off template1 -c "$cmd" 2>&1 
| tee -a "${LOG_DIR}/psql-commands.log" + return ${PIPESTATUS[0]} +} + +# Function to source Cloudberry environment +source_cloudberry_env() { + echo "=== Sourcing Cloudberry environment ===" | tee -a "${LOG_DIR}/environment.log" + source ${BUILD_DESTINATION}/cloudberry-env.sh + source ${SRC_DIR}/../cloudberry/gpAux/gpdemo/gpdemo-env.sh +} + +# Function to log section start +log_section() { + local section_name=$1 + local timestamp=$(date "+%Y.%m.%d-%H.%M.%S") + echo "=== ${section_name} started at ${timestamp} ===" | tee -a "${LOG_DIR}/sections.log" +} + +# Function to log section end +log_section_end() { + local section_name=$1 + local timestamp=$(date "+%Y.%m.%d-%H.%M.%S") + echo "=== ${section_name} completed at ${timestamp} ===" | tee -a "${LOG_DIR}/sections.log" +} + +# Function to log script completion +log_completion() { + local script_name=$1 + local log_file=$2 + local timestamp=$(date "+%Y.%m.%d-%H.%M.%S") + echo "${script_name} execution completed successfully at ${timestamp}" | tee -a "${log_file}" +} + +# Function to get OS identifier +detect_os() { + if [ -f /etc/os-release ]; then + . /etc/os-release + OS_ID=$ID + OS_VERSION=$VERSION_ID + else + echo "Unsupported system: cannot detect OS" >&2 + exit 99 + fi +} + +# Init BUILD_DESTINATION default value if not set +init_build_destination_var() { + + if [ -z ${BUILD_DESTINATION+x} ]; then + export BUILD_DESTINATION=${DEFAULT_BUILD_DESTINATION} + exec "$@" + fi + +} diff --git a/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh b/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh new file mode 100755 index 00000000000..bc046695032 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/configure-cloudberry.sh @@ -0,0 +1,178 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: configure-cloudberry.sh +# Description: Configures Apache Cloudberry build environment and runs +# ./configure with optimized settings. Performs the +# following: +# 1. Prepares ${BUILD_DESTINATION} (by default /usr/local/cloudberry-db) directory +# 2. Sets up library dependencies +# 3. 
Configures build with required features enabled +# +# Configuration Features: +# - Cloud Storage Integration (gpcloud) +# - IC Proxy Support +# - MapReduce Processing +# - Oracle Compatibility (orafce) +# - ORCA Query Optimizer +# - PAX Access Method +# - PXF External Table Access +# - Test Automation Support (tap-tests) +# +# System Integration: +# - GSSAPI Authentication +# - LDAP Authentication +# - XML Processing +# - LZ4 Compression +# - OpenSSL Support +# - PAM Authentication +# - Perl Support +# - Python Support +# +# Required Environment Variables: +# SRC_DIR - Root source directory +# BUILD_DESTINATION - Directory to build binaries +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to ${SRC_DIR}/build-logs) +# ENABLE_DEBUG - Enable debug build options (true/false, defaults to +# false) +# +# When true, enables: +# --enable-debug +# --enable-profiling +# --enable-cassert +# --enable-debug-extensions +# +# Prerequisites: +# - System dependencies must be installed: +# * xerces-c development files +# * OpenSSL development files +# * Python development files +# * Perl development files +# * LDAP development files +# - /usr/local must be writable +# - User must have sudo privileges +# +# Usage: +# Export required variables: +# export SRC_DIR=/path/to/cloudberry/source +# Then run: +# ./configure-cloudberry.sh +# +# Exit Codes: +# 0 - Configuration completed successfully +# 1 - Environment setup failed +# 2 - Directory preparation failed +# 3 - Library setup failed +# 4 - Configure command failed +# +# -------------------------------------------------------------------- + +set -euo pipefail + +# Source common utilities +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}/cloudberry-utils.sh" + +# Call it before conditional logic +detect_os +echo "Detected OS: $OS_ID $OS_VERSION" + +# Define log directory and files +export LOG_DIR="${SRC_DIR}/build-logs" +CONFIGURE_LOG="${LOG_DIR}/configure.log" + +# Initialize environment +init_environment "Cloudberry Configure Script" "${CONFIGURE_LOG}" + +# Check if BUILD_DESTINATION is set +if [ -z "${BUILD_DESTINATION}" ]; then + log_completion "BUILD_DESTINATION is empty - error with exit" + exit 1 +fi + +# Initial setup +log_section "Initial Setup" +execute_cmd sudo rm -rf ${BUILD_DESTINATION} || exit 2 +execute_cmd sudo chmod a+w /usr/local || exit 2 +execute_cmd sudo mkdir -p ${BUILD_DESTINATION}/lib || exit 2 +if [[ "$OS_ID" == "rocky" && "$OS_VERSION" =~ ^(8|9) ]]; then + execute_cmd sudo cp /usr/local/xerces-c/lib/libxerces-c.so \ + /usr/local/xerces-c/lib/libxerces-c-3.3.so \ + ${BUILD_DESTINATION}/lib || exit 3 +fi +execute_cmd sudo chown -R gpadmin:gpadmin ${BUILD_DESTINATION} || exit 2 +log_section_end "Initial Setup" + +# Set environment +log_section "Environment Setup" +export LD_LIBRARY_PATH=${BUILD_DESTINATION}/lib:LD_LIBRARY_PATH +log_section_end "Environment Setup" + +# Add debug options if ENABLE_DEBUG is set to "true" +CONFIGURE_DEBUG_OPTS="" + +if [ "${ENABLE_DEBUG:-false}" = "true" ]; then + CONFIGURE_DEBUG_OPTS="--enable-debug \ + --enable-profiling \ + --enable-cassert \ + --enable-debug-extensions" +fi + +# Configure build +log_section "Configure" +execute_cmd ./configure --prefix=${BUILD_DESTINATION} \ + --disable-external-fts \ + --enable-gpcloud \ + --enable-ic-proxy \ + --enable-mapreduce \ + --enable-orafce \ + --enable-orca \ + --enable-pax \ + --enable-pxf \ + --enable-tap-tests \ + ${CONFIGURE_DEBUG_OPTS} \ + --with-gssapi \ + --with-ldap \ + --with-libxml 
\ + --with-lz4 \ + --with-openssl \ + --with-pam \ + --with-perl \ + --with-pgport=5432 \ + --with-python \ + --with-pythonsrc-ext \ + --with-ssl=openssl \ + --with-openssl \ + --with-uuid=e2fs \ + --with-includes=/usr/local/xerces-c/include \ + --with-libraries=${BUILD_DESTINATION}/lib || exit 4 +log_section_end "Configure" + +# Capture version information +log_section "Version Information" +execute_cmd ag "GP_VERSION | GP_VERSION_NUM | PG_VERSION | PG_VERSION_NUM | PG_VERSION_STR" src/include/pg_config.h +log_section_end "Version Information" + +# Log completion +log_completion "Cloudberry Configure Script" "${CONFIGURE_LOG}" +exit 0 diff --git a/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh b/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh new file mode 100755 index 00000000000..ec582e80402 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/create-cloudberry-demo-cluster.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: create-cloudberry-demo-cluster.sh +# Description: Creates and configures a demo Apache Cloudbery cluster. +# Performs the following steps: +# 1. Sets up required environment variables +# 2. Verifies SSH connectivity +# 3. Creates demo cluster using make +# 4. Initializes and starts the cluster +# 5. 
Performs comprehensive verification checks +# +# Required Environment Variables: +# SRC_DIR - Root source directory +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to ${SRC_DIR}/build-logs) +# +# Prerequisites: +# - Apache Cloudberry must be installed (default location is /usr/local/cloudberry-db) +# - SSH must be configured for passwordless access to localhost +# - User must have permissions to create cluster directories +# - PostgreSQL client tools (psql) must be available +# +# Usage: +# Export required variables: +# export SRC_DIR=/path/to/cloudberry/source +# Then run: +# ./create-cloudberry-demo-cluster.sh +# +# Verification Checks: +# - Apache Cloudberry version +# - Segment configuration +# - Available extensions +# - Active sessions +# - Configuration history +# - Replication status +# +# Exit Codes: +# 0 - Cluster created and verified successfully +# 1 - Environment setup failed +# 2 - SSH verification failed +# 3 - Cluster creation failed +# 4 - Cluster startup failed +# 5 - Verification checks failed +# +# -------------------------------------------------------------------- + +set -euo pipefail + +# Source common utilities +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}/cloudberry-utils.sh" + +# Define log directory +export LOG_DIR="${SRC_DIR}/build-logs" +CLUSTER_LOG="${LOG_DIR}/cluster.log" + +# Initialize environment +init_environment "Cloudberry Demo Cluster Script" "${CLUSTER_LOG}" + +# Setup environment +log_section "Environment Setup" +source ${BUILD_DESTINATION}/cloudberry-env.sh || exit 1 +log_section_end "Environment Setup" + +# Verify SSH access +log_section "SSH Verification" +execute_cmd ssh $(hostname) 'whoami; hostname' || exit 2 +log_section_end "SSH Verification" + +# Create demo cluster +log_section "Demo Cluster Creation" +execute_cmd make create-demo-cluster --directory ${SRC_DIR}/../cloudberry || exit 3 +log_section_end "Demo Cluster Creation" + +# Source demo environment +log_section "Source Environment" +source ${SRC_DIR}/../cloudberry/gpAux/gpdemo/gpdemo-env.sh || exit 1 +log_section_end "Source Environment" + +# Manage cluster state +log_section "Cluster Management" +execute_cmd gpstop -a || exit 4 +execute_cmd gpstart -a || exit 4 +execute_cmd gpstate || exit 4 +log_section_end "Cluster Management" + +# Verify installation +log_section "Installation Verification" +verification_failed=false +run_psql_cmd "SELECT version()" || verification_failed=true +run_psql_cmd "SELECT * from gp_segment_configuration" || verification_failed=true +run_psql_cmd "SELECT * FROM pg_available_extensions" || verification_failed=true +run_psql_cmd "SELECT * from pg_stat_activity" || verification_failed=true +run_psql_cmd "SELECT * FROM gp_configuration_history" || verification_failed=true +run_psql_cmd "SELECT * FROM gp_stat_replication" || verification_failed=true + +if [ "$verification_failed" = true ]; then + echo "One or more verification checks failed" | tee -a "${CLUSTER_LOG}" + exit 5 +fi +log_section_end "Installation Verification" + +# Log completion +log_completion "Cloudberry Demo Cluster Script" "${CLUSTER_LOG}" +exit 0 diff --git a/devops/build/automation/cloudberry/scripts/destroy-cloudberry-demo-cluster.sh b/devops/build/automation/cloudberry/scripts/destroy-cloudberry-demo-cluster.sh new file mode 100755 index 00000000000..3d4ce241979 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/destroy-cloudberry-demo-cluster.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# 
-------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: destroy-cloudberry-demo-cluster.sh +# Description: Destroys and cleans up a demo Apache Cloudberry +# cluster. +# Performs the following steps: +# 1. Sources required environment variables +# 2. Stops any running cluster processes +# 3. Removes cluster data directories and configuration +# 4. Cleans up any remaining cluster resources +# +# Required Environment Variables: +# SRC_DIR - Root source directory +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to ${SRC_DIR}/build-logs) +# +# Prerequisites: +# - Apache Cloudberry environment must be available +# - User must have permissions to remove cluster directories +# - No active connections to the cluster +# +# Usage: +# Export required variables: +# export SRC_DIR=/path/to/cloudberry/source +# Then run: +# ./destroy-cloudberry-demo-cluster.sh +# +# Exit Codes: +# 0 - Cluster destroyed successfully +# 1 - Environment setup/sourcing failed +# 2 - Cluster destruction failed +# +# Related Scripts: +# - create-cloudberry-demo-cluster.sh: Creates a new demo cluster +# +# Notes: +# - This script will forcefully terminate all cluster processes +# - All cluster data will be permanently deleted +# - Make sure to backup any important data before running +# +# -------------------------------------------------------------------- + +set -euo pipefail + +# Source common utilities +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}/cloudberry-utils.sh" + +# Define log directory +export LOG_DIR="${SRC_DIR}/build-logs" +CLUSTER_LOG="${LOG_DIR}/destroy-cluster.log" + +# Initialize environment +init_environment "Destroy Cloudberry Demo Cluster Script" "${CLUSTER_LOG}" + +# Source Cloudberry environment +log_section "Environment Setup" +source_cloudberry_env || { + echo "Failed to source Cloudberry environment" | tee -a "${CLUSTER_LOG}" + exit 1 +} +log_section_end "Environment Setup" + +# Destroy demo cluster +log_section "Destroy Demo Cluster" +execute_cmd make destroy-demo-cluster --directory ${SRC_DIR}/../cloudberry || { + echo "Failed to destroy demo cluster" | tee -a "${CLUSTER_LOG}" + exit 2 +} +log_section_end "Destroy Demo Cluster" + +# Verify cleanup +log_section "Cleanup Verification" +if [ -d "${SRC_DIR}/../cloudberry/gpAux/gpdemo/data" ]; then + echo "Warning: Data directory still exists after cleanup" | tee -a "${CLUSTER_LOG}" +fi +log_section_end "Cleanup Verification" + +# Log completion +log_completion "Destroy Cloudberry Demo Cluster Script" "${CLUSTER_LOG}" +exit 0 diff --git a/devops/build/automation/cloudberry/scripts/parse-results.pl 
b/devops/build/automation/cloudberry/scripts/parse-results.pl new file mode 100755 index 00000000000..d09085d5fb9 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/parse-results.pl @@ -0,0 +1,215 @@ +#!/usr/bin/env perl +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: parse_results.pl +# Description: Processes Cloudberry test output to extract statistics +# and results. +# Analyzes test log files to determine: +# 1. Overall test status (pass/fail) +# 2. Total number of tests run +# 3. Number of passed, failed, and ignored tests +# 4. Names of failed and ignored tests +# 5. Validates test counts for consistency +# Results are written to a file for shell script processing. +# +# Arguments: +# log-file Path to test log file (required) +# +# Input File Format: +# Expects test log files containing one of the following summary formats: +# - "All X tests passed." +# - "Y of X tests failed." +# - "X of Y tests passed, Z failed test(s) ignored." +# - "X of Y tests failed, Z of these failures ignored." +# +# And failed or ignored test entries in format: +# - "test_name ... FAILED" +# - "test_name ... failed (ignored)" +# +# Output File (test_results.txt): +# Environment variable format: +# STATUS=passed|failed +# TOTAL_TESTS= +# FAILED_TESTS= +# PASSED_TESTS= +# IGNORED_TESTS= +# FAILED_TEST_NAMES= +# IGNORED_TEST_NAMES= +# +# Prerequisites: +# - Read access to input log file +# - Write access to current directory +# - Perl 5.x or higher +# +# Exit Codes: +# 0 - All tests passed, or only ignored failures occurred +# 1 - Some non-ignored tests failed +# 2 - Parse error or cannot access files +# +# Example Usage: +# ./parse_results.pl test_output.log +# +# Error Handling: +# - Validates input file existence and readability +# - Verifies failed and ignored test counts match found entries +# - Reports parsing errors with detailed messages +# +# -------------------------------------------------------------------- + +use strict; +use warnings; + +# Exit codes +use constant { + SUCCESS => 0, + TEST_FAILURE => 1, + PARSE_ERROR => 2 +}; + +# Get log file path from command line argument +my $file = $ARGV[0] or die "Usage: $0 LOG_FILE\n"; +print "Parsing test results from: $file\n"; + +# Check if file exists and is readable +unless (-e $file) { + print "Error: File does not exist: $file\n"; + exit PARSE_ERROR; +} +unless (-r $file) { + print "Error: File is not readable: $file\n"; + exit PARSE_ERROR; +} + +# Open and parse the log file +open(my $fh, '<', $file) or do { + print "Cannot open log file: $! 
(looking in $file)\n"; + exit PARSE_ERROR; +}; + +# Initialize variables +my ($status, $total_tests, $failed_tests, $ignored_tests, $passed_tests) = ('', 0, 0, 0, 0); +my @failed_test_list = (); +my @ignored_test_list = (); + +while (<$fh>) { + # Match the summary lines + if (/All (\d+) tests passed\./) { + $status = 'passed'; + $total_tests = $1; + $passed_tests = $1; + } + elsif (/(\d+) of (\d+) tests passed, (\d+) failed test\(s\) ignored\./) { + $status = 'passed'; + $passed_tests = $1; + $total_tests = $2; + $ignored_tests = $3; + } + elsif (/(\d+) of (\d+) tests failed\./) { + $status = 'failed'; + $failed_tests = $1; + $total_tests = $2; + $passed_tests = $2 - $1; + } + elsif (/(\d+) of (\d+) tests failed, (\d+) of these failures ignored\./) { + $status = 'failed'; + $failed_tests = $1 - $3; + $ignored_tests = $3; + $total_tests = $2; + $passed_tests = $2 - $1; + } + + # Capture failed tests + if (/^(?:\s+|test\s+)(\S+)\s+\.\.\.\s+FAILED\s+/) { + push @failed_test_list, $1; + } + + # Capture ignored tests + if (/^(?:\s+|test\s+)(\S+)\s+\.\.\.\s+failed \(ignored\)/) { + push @ignored_test_list, $1; + } +} + +# Close the log file +close $fh; + +# Validate failed test count matches found test names +if ($status eq 'failed' && scalar(@failed_test_list) != $failed_tests) { + print "Error: Found $failed_tests failed tests in summary but found " . scalar(@failed_test_list) . " failed test names\n"; + print "Failed test names found:\n"; + foreach my $test (@failed_test_list) { + print " - $test\n"; + } + exit PARSE_ERROR; +} + +# Validate ignored test count matches found test names +if ($ignored_tests != scalar(@ignored_test_list)) { + print "Error: Found $ignored_tests ignored tests in summary but found " . scalar(@ignored_test_list) . " ignored test names\n"; + print "Ignored test names found:\n"; + foreach my $test (@ignored_test_list) { + print " - $test\n"; + } + exit PARSE_ERROR; +} + +# Write results to the results file +open my $result_fh, '>', 'test_results.txt' or die "Cannot write to results file: $!\n"; +print $result_fh "STATUS=$status\n"; +print $result_fh "TOTAL_TESTS=$total_tests\n"; +print $result_fh "PASSED_TESTS=$passed_tests\n"; +print $result_fh "FAILED_TESTS=$failed_tests\n"; +print $result_fh "IGNORED_TESTS=$ignored_tests\n"; +if (@failed_test_list) { + print $result_fh "FAILED_TEST_NAMES=" . join(',', @failed_test_list) . "\n"; +} +if (@ignored_test_list) { + print $result_fh "IGNORED_TEST_NAMES=" . join(',', @ignored_test_list) . 
"\n"; +} +close $result_fh; + +# Print to stdout for logging +print "Test Results:\n"; +print "Status: $status\n"; +print "Total Tests: $total_tests\n"; +print "Failed Tests: $failed_tests\n"; +print "Ignored Tests: $ignored_tests\n"; +print "Passed Tests: $passed_tests\n"; +if (@failed_test_list) { + print "Failed Test Names:\n"; + foreach my $test (@failed_test_list) { + print " - $test\n"; + } +} +if (@ignored_test_list) { + print "Ignored Test Names:\n"; + foreach my $test (@ignored_test_list) { + print " - $test\n"; + } +} + +# Exit with appropriate code +if ($status eq 'passed') { + exit SUCCESS; +} elsif ($status eq 'failed') { + exit TEST_FAILURE; +} else { + exit PARSE_ERROR; +} diff --git a/devops/build/automation/cloudberry/scripts/parse-test-results.sh b/devops/build/automation/cloudberry/scripts/parse-test-results.sh new file mode 100755 index 00000000000..ace00f63b3f --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/parse-test-results.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: parse-test-results.sh +# Description: Parses Apache Cloudberry test results and processes the +# output. +# Provides GitHub Actions integration and environment +# variable export functionality. This script is a wrapper +# around parse_results.pl, adding the following features: +# 1. Default log file path handling +# 2. GitHub Actions output integration +# 3. Environment variable management +# 4. 
Result file cleanup +# +# Arguments: +# [log-file] - Path to test log file +# (defaults to build-logs/details/make-${MAKE_NAME}.log) +# +# Prerequisites: +# - parse_results.pl must be in the same directory +# - Perl must be installed and in PATH +# - Write access to current directory (for temporary files) +# - Read access to test log file +# +# Output Variables (in GitHub Actions): +# status - Test status (passed/failed) +# total_tests - Total number of tests +# failed_tests - Number of failed tests +# passed_tests - Number of passed tests +# ignored_tests - Number of ignored tests +# failed_test_names - Names of failed tests (comma-separated) +# ignored_test_names - Names of ignored tests (comma-separated) +# +# Usage Examples: +# # Parse default log file: +# ./parse-test-results.sh +# +# # Parse specific log file: +# ./parse-test-results.sh path/to/test.log +# +# # Use with GitHub Actions: +# export GITHUB_OUTPUT=/path/to/output +# ./parse-test-results.sh +# +# Exit Codes: +# 0 - All tests passed successfully +# 1 - Tests failed but results were properly parsed +# 2 - Parse error, missing files, or unknown status +# +# Files Created/Modified: +# - Temporary: test_results.txt (automatically cleaned up) +# - If GITHUB_OUTPUT set: Appends results to specified file +# +# -------------------------------------------------------------------- + +set -uo pipefail + +# Default log file path +DEFAULT_LOG_PATH="build-logs/details/make-${MAKE_NAME}.log" +LOG_FILE=${1:-$DEFAULT_LOG_PATH} + +# Get the directory where this script is located +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# Check if log file exists +if [ ! -f "$LOG_FILE" ]; then + echo "Error: Test log file not found: $LOG_FILE" + exit 2 +fi + +# Run the perl script and capture its exit code +perl "${SCRIPT_DIR}/parse-results.pl" "$LOG_FILE" +perl_exit_code=$? + +# Check if results file exists and source it if it does +if [ ! -f test_results.txt ]; then + echo "Error: No results file generated" + exit 2 +fi + +# Return the perl script's exit code +exit $perl_exit_code diff --git a/devops/build/automation/cloudberry/scripts/test-cloudberry.sh b/devops/build/automation/cloudberry/scripts/test-cloudberry.sh new file mode 100755 index 00000000000..411f16ca625 --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/test-cloudberry.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: test-cloudberry.sh +# Description: Executes Apache Cloudberry test suite using specified +# make target. Supports different test types through make +# target configuration. 
Sources Cloudberry environment +# before running tests. +# +# Required Environment Variables: +# MAKE_TARGET - Make target to execute (e.g., installcheck-world) +# MAKE_DIRECTORY - Directory where make command will be executed +# MAKE_NAME - Name of the make operation (for logging) +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to build-logs) +# PGOPTIONS - PostgreSQL server options +# +# Usage: +# Export required variables: +# export MAKE_TARGET=installcheck-world +# export MAKE_DIRECTORY="/path/to/make/dir" +# export MAKE_NAME="Install Check" +# Then run: +# ./test-cloudberry.sh +# +# Exit Codes: +# 0 - All tests passed successfully +# 1 - Environment setup failed (missing required variables, environment sourcing failed) +# 2 - Test execution failed (make command returned error) +# +# -------------------------------------------------------------------- + +set -euo pipefail + +# Source common utilities +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}/cloudberry-utils.sh" + +# Define log directory and files +export LOG_DIR="build-logs" +TEST_LOG="${LOG_DIR}/test.log" + +# Initialize environment +init_environment "Cloudberry Test Script" "${TEST_LOG}" + +# Source Cloudberry environment +log_section "Environment Setup" +source_cloudberry_env || exit 1 +log_section_end "Environment Setup" + +echo "MAKE_TARGET: ${MAKE_TARGET}" +echo "MAKE_DIRECTORY: ${MAKE_DIRECTORY}" +echo "PGOPTIONS: ${PGOPTIONS}" + +# Execute specified target +log_section "Install Check" +execute_cmd make ${MAKE_TARGET} ${MAKE_DIRECTORY} || exit 2 +log_section_end "Install Check" + +# Log completion +log_completion "Cloudberry Test Script" "${TEST_LOG}" +exit 0 diff --git a/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh b/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh new file mode 100755 index 00000000000..97107ea1a9f --- /dev/null +++ b/devops/build/automation/cloudberry/scripts/unittest-cloudberry.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Script: unittest-cloudberry.sh +# Description: Executes unit tests for Apache Cloudberry from source +# code. Runs the 'unittest-check' make target and logs +# results. Tests are executed against the compiled source +# without requiring a full installation. 
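+#
+# Logging:
+#   High-level status messages are appended to ${LOG_DIR}/unittest.log,
+#   while the full output of every command run through cloudberry-utils.sh
+#   is captured in ${LOG_DIR}/commands.log.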
+# +# Required Environment Variables: +# SRC_DIR - Root source directory +# +# Optional Environment Variables: +# LOG_DIR - Directory for logs (defaults to ${SRC_DIR}/build-logs) +# +# Usage: +# ./unittest-cloudberry.sh +# +# Exit Codes: +# 0 - All unit tests passed successfully +# 1 - Environment setup failed (missing SRC_DIR, LOG_DIR creation failed) +# 2 - Unit test execution failed +# +# -------------------------------------------------------------------- + +set -euo pipefail + +# Source common utilities +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}/cloudberry-utils.sh" + +# Define log directory and files +export LOG_DIR="${SRC_DIR}/build-logs" +UNITTEST_LOG="${LOG_DIR}/unittest.log" + +# Initialize environment +init_environment "Cloudberry Unittest Script" "${UNITTEST_LOG}" + +# Set environment +log_section "Environment Setup" +export LD_LIBRARY_PATH=${BUILD_DESTINATION}/lib:LD_LIBRARY_PATH +log_section_end "Environment Setup" + +# Unittest process +log_section "Unittest Process" +execute_cmd make --directory ${SRC_DIR}/../cloudberry unittest-check || exit 2 +log_section_end "Unittest Process" + +# Log completion +log_completion "Cloudberry Unittest Script" "${UNITTEST_LOG}" +exit 0 diff --git a/devops/build/packaging/deb/build-deb.sh b/devops/build/packaging/deb/build-deb.sh new file mode 100755 index 00000000000..2e7312be53f --- /dev/null +++ b/devops/build/packaging/deb/build-deb.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script Name: build-deb.sh +# +# Description: +# This script automates the process of building an DEB package using a specified +# version number. It ensures that the necessary tools are installed +# and that the control file exists before attempting to build the DEB. The script +# also includes error handling to provide meaningful feedback in case of failure. +# +# Usage: +# ./build-deb.sh [-v ] [-h] [--dry-run] +# +# Options: +# -v, --version : Specify the version (required) +# -h, --help : Display this help and exit +# -n, --dry-run : Show what would be done, without making any changes +# +# Example: +# ./build-deb.sh -v 1.5.5 # Build with version 1.5.5 +# +# Prerequisites: +# - The dpkg-buildpackage package must be installed (provides the dpkg-buildpackage command). +# - The control file must exist at debian/control. +# +# Error Handling: +# The script includes checks to ensure: +# - The version option (-v or --version) is provided. +# - The necessary commands are available. +# - The control file exists at the specified location. +# If any of these checks fail, the script exits with an appropriate error message. 
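+#
+# Output:
+#   dpkg-buildpackage places the resulting .deb (along with the .changes and
+#   .buildinfo files) in the parent directory of the source tree. An
+#   illustrative post-build inspection (the exact file name depends on the
+#   computed package version):
+#
+#     dpkg-deb --info     ../apache-cloudberry-db-incubating_*.deb
+#     dpkg-deb --contents ../apache-cloudberry-db-incubating_*.deb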
+ +# Enable strict mode for better error handling +set -euo pipefail + +# Default values +VERSION="" +RELEASE="1" +DEBUG_BUILD=false + +# Function to display usage information +usage() { + echo "Usage: $0 [-v ] [-h] [--dry-run]" + echo " -v, --version : Specify the version (optional)" + echo " -h, --help : Display this help and exit" + echo " -n, --dry-run : Show what would be done, without making any changes" + exit 1 +} + +# Function to check if required commands are available +check_commands() { + local cmds=("dpkg-buildpackage") + for cmd in "${cmds[@]}"; do + if ! command -v "$cmd" &> /dev/null; then + echo "Error: Required command '$cmd' not found. Please install it before running the script." + exit 1 + fi + done +} + +function print_changelog() { +cat < $(date +'%a, %d %b %Y %H:%M:%S %z') +EOF +} + +# Parse options +while [[ "$#" -gt 0 ]]; do + case $1 in + -v|--version) + VERSION="$2" + shift 2 + ;; + -h|--help) + usage + ;; + -n|--dry-run) + DRY_RUN=true + shift + ;; + *) + echo "Unknown option: ($1)" + shift + ;; + esac +done + +export CBDB_FULL_VERSION=$VERSION + +# Set version if not provided +if [ -z "${VERSION}" ]; then + export CBDB_FULL_VERSION=$(./getversion | cut -d'-' -f 1 | cut -d'+' -f 1) +fi + +if [[ ! $CBDB_FULL_VERSION =~ ^[0-9] ]]; then + export CBDB_FULL_VERSION="0.$CBDB_FULL_VERSION" +fi + +if [ -z ${BUILD_NUMBER+x} ]; then + export BUILD_NUMBER=1 +fi + +if [ -z ${BUILD_USER+x} ]; then + export BUILD_USER=github +fi + +export CBDB_PKG_VERSION=${CBDB_FULL_VERSION}-${BUILD_NUMBER}-$(git --git-dir=.git rev-list HEAD --count).$(git --git-dir=.git rev-parse --short HEAD) + +# Check if required commands are available +check_commands + +# Define the control file path +CONTROL_FILE=debian/control + +# Check if the spec file exists +if [ ! -f "$CONTROL_FILE" ]; then + echo "Error: Control file not found at $CONTROL_FILE." + exit 1 +fi + +# Build the rpmbuild command based on options +DEBBUILD_CMD="dpkg-buildpackage -us -uc" + +# Dry-run mode +if [ "${DRY_RUN:-false}" = true ]; then + echo "Dry-run mode: This is what would be done:" + print_changelog + echo "" + echo "$DEBBUILD_CMD" + exit 0 +fi + +# Run debbuild with the provided options +echo "Building DEB with Version $CBDB_FULL_VERSION ..." + +print_changelog > debian/changelog + +if ! eval "$DEBBUILD_CMD"; then + echo "Error: deb build failed." + exit 1 +fi + +# Print completion message +echo "DEB build completed successfully with package $CBDB_PKG_VERSION" diff --git a/devops/build/packaging/deb/ubuntu22.04/changelog b/devops/build/packaging/deb/ubuntu22.04/changelog new file mode 100644 index 00000000000..211d271b88f --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/changelog @@ -0,0 +1,5 @@ +apache-cloudberry-db-incubating (2.0.0) jammy; urgency=medium + + * Initial release. 
+ + -- Cloudberry Team Wed, 26 Mar 2025 11:10:44 +0000 diff --git a/devops/build/packaging/deb/ubuntu22.04/compat b/devops/build/packaging/deb/ubuntu22.04/compat new file mode 100644 index 00000000000..ec635144f60 --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/compat @@ -0,0 +1 @@ +9 diff --git a/devops/build/packaging/deb/ubuntu22.04/control b/devops/build/packaging/deb/ubuntu22.04/control new file mode 100644 index 00000000000..70e4eda77d3 --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/control @@ -0,0 +1,107 @@ +Source: apache-cloudberry-db-incubating +Maintainer: Apache Cloudberry (Incubating) +Section: database +Build-Depends: debhelper (>= 9), + bison, + ca-certificates-java, + ca-certificates, + cmake, + curl, + cgroup-tools, + flex, + gcc-11, + g++-11, + git, + krb5-multidev, + libapr1-dev, + libbz2-dev, + libcurl4-gnutls-dev, + libevent-dev, + libkrb5-dev, + libldap2-dev, + libperl-dev, + libreadline6-dev, + libssl-dev, + libxml2-dev, + libyaml-dev, + libzstd-dev, + libaprutil1-dev, + libpam0g-dev, + libpam0g, + libcgroup1, + libyaml-0-2, + libldap-2.5-0, + libssl3, + libxerces-c-dev, + libxerces-c3.2, + ninja-build, + quilt, + unzip, + wget, + zlib1g-dev, + libuv1-dev + +Package: apache-cloudberry-db-incubating +Provides: apache-cloudberry-db +Architecture: amd64 +Depends: curl, + cgroup-tools, + iputils-ping, + iproute2, + keyutils, + krb5-multidev, + less, + libapr1, + libbz2-1.0, + libcurl4, + libcurl3-gnutls, + libevent-2.1-7, + libreadline8, + libxml2, + libyaml-0-2, + libldap-2.5-0, + libzstd1, + libcgroup1, + libssl3, + libpam0g, + libprotobuf23, + libpsl5, + libuv1, + liburing2, + libxerces-c3.2, + locales, + lsof, + lz4, + net-tools, + openssh-client, + openssh-server, + openssl, + python3, + rsync, + wget, + xz-utils, + zlib1g +Description: Apache Cloudberry (incubating) is an advanced, open-source, massively + parallel processing (MPP) data warehouse developed from PostgreSQL and + Greenplum. It is designed for high-performance analytics on + large-scale data sets, offering powerful analytical capabilities and + enhanced security features. + Key Features: + - Massively parallel processing for optimized performance + - Advanced analytics for complex data processing + - Integration with ETL and BI tools + - Compatibility with multiple data sources and formats + - Enhanced security features + Apache Cloudberry supports both batch processing and real-time data + warehousing, making it a versatile solution for modern data + environments. + Apache Cloudberry (incubating) is an effort undergoing incubation at + the Apache Software Foundation (ASF), sponsored by the Apache + Incubator PMC. + Incubation is required of all newly accepted projects until a further + review indicates that the infrastructure, communications, and decision + making process have stabilized in a manner consistent with other + successful ASF projects. + While incubation status is not necessarily a reflection of the + completeness or stability of the code, it does indicate that the + project has yet to be fully endorsed by the ASF. 
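Before running `build-deb.sh`, the Build-Depends list declared in the control file above can be checked against the local system. This is a minimal sketch, assuming the `debian/` directory has been staged at the top of the source tree as the build script expects:

```bash
# Report any build dependencies from debian/control that are not yet installed
dpkg-checkbuilddeps

# Or let apt install everything the control file declares as Build-Depends
sudo apt-get build-dep ./
```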
diff --git a/devops/build/packaging/deb/ubuntu22.04/install b/devops/build/packaging/deb/ubuntu22.04/install new file mode 100644 index 00000000000..3e29bb0dd35 --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/install @@ -0,0 +1 @@ +debian/build/* /usr/cloudberry-db diff --git a/devops/build/packaging/deb/ubuntu22.04/postinst b/devops/build/packaging/deb/ubuntu22.04/postinst new file mode 100644 index 00000000000..ccb33a54033 --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/postinst @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +GPADMIN=gpadmin +GPHOME=/usr/cloudberry-db + +if [ "$1" = configure ]; then + + ln -sf ${GPHOME} /usr/local/cloudberry-db + +fi diff --git a/devops/build/packaging/deb/ubuntu22.04/rules b/devops/build/packaging/deb/ubuntu22.04/rules new file mode 100755 index 00000000000..6213985b48c --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/rules @@ -0,0 +1,49 @@ +#!/usr/bin/make -f + +DH_VERBOSE = 1 +DPKG_EXPORT_BUILDFLAGS = 1 + +CBDB_BIN_PATH := /usr/local/cloudberry-db +DEBIAN_DESTINATION := $(shell pwd)/debian/build + +# assumes that CWD is root of cbdb source +CBDB_PKG_VERSION := $(CBDB_PKG_VERSION) +PACKAGE_CBDB := $(shell cat debian/control | egrep "^Package: " | cut -d " " -f 2) +PATH := ${DEBIAN_DESTINATION}/bin:${PATH} + +.PHONY: gpinstall + +include /usr/share/dpkg/default.mk + +%: + dh $@ --parallel + +gpinstall: + make install + +override_dh_auto_install: gpinstall + # the staging directory for creating a debian is NOT the right GPHOME. + # change GPHOME to point to the post-install target install directory. + sed -i "s#GPHOME=.*#GPHOME=${CBDB_BIN_PATH}#g" ${DEBIAN_DESTINATION}/cloudberry-env.sh + +override_dh_auto_build: + echo "Skipping build" + +override_dh_auto_clean: + echo "Skipping clean" + +override_dh_auto_configure: + echo "Skipping configure" + +override_dh_auto_test: + echo "Skipping auto test" + +override_dh_gencontrol: + echo "using version ${CBDB_PKG_VERSION} for binary Cloudberry" + dh_gencontrol -- -v${CBDB_PKG_VERSION} -p${PACKAGE_CBDB} + +override_dh_shlibdeps: + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/x86_64-linux-gnu/libfakeroot:${DEBIAN_DESTINATION}/lib dh_shlibdeps --dpkg-shlibdeps-params=--ignore-missing-info + +clean_dev_local: + rm -rf ${DEBIAN_DESTINATION} diff --git a/devops/build/packaging/deb/ubuntu22.04/source/format b/devops/build/packaging/deb/ubuntu22.04/source/format new file mode 100644 index 00000000000..89ae9db8f88 --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/source/format @@ -0,0 +1 @@ +3.0 (native) diff --git a/devops/build/packaging/deb/ubuntu22.04/source/local-options b/devops/build/packaging/deb/ubuntu22.04/source/local-options new file mode 100644 index 00000000000..00131ee8c41 --- /dev/null +++ b/devops/build/packaging/deb/ubuntu22.04/source/local-options @@ -0,0 +1,2 @@ +#abort-on-upstream-changes +#unapply-patches diff --git a/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec b/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec new file mode 100644 index 00000000000..03fa0a34570 --- /dev/null +++ b/devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +%define cloudberry_install_dir /usr/local/cloudberry-db + +# Add at the top of the spec file +# Default to non-debug build +%bcond_with debug + +# Conditional stripping based on debug flag +%if %{with debug} +%define __os_install_post %{nil} +%define __strip /bin/true +%endif + +Name: apache-cloudberry-db-incubating +Version: %{version} +# In the release definition section +%if %{with debug} +Release: %{release}.debug%{?dist} +%else +Release: %{release}%{?dist} +%endif +Summary: High-performance, open-source data warehouse based on PostgreSQL/Greenplum + +License: ASL 2.0 +URL: https://cloudberry.apache.org +Vendor: Apache Cloudberry (Incubating) +Group: Applications/Databases +Prefix: %{cloudberry_install_dir} + +# Disabled as we are shipping GO programs (e.g. gpbackup) +%define _missing_build_ids_terminate_build 0 + +# Disable debugsource files +%define _debugsource_template %{nil} + +# List runtime dependencies + +Requires: bash +Requires: iproute +Requires: iputils +Requires: openssh +Requires: openssh-clients +Requires: openssh-server +Requires: rsync + +%if 0%{?rhel} == 8 +Requires: apr +Requires: audit +Requires: bzip2 +Requires: keyutils +Requires: libcurl +Requires: libevent +Requires: libidn2 +Requires: libselinux +Requires: libstdc++ +Requires: libuuid +Requires: libuv +Requires: libxml2 +Requires: libyaml +Requires: libzstd +Requires: lz4 +Requires: openldap +Requires: pam +Requires: perl +Requires: python3 +Requires: readline +%endif + +%if 0%{?rhel} == 9 +Requires: apr +Requires: bzip2 +Requires: glibc +Requires: keyutils +Requires: libcap +Requires: libcurl +Requires: libidn2 +Requires: libpsl +Requires: libssh +Requires: libstdc++ +Requires: libxml2 +Requires: libyaml +Requires: libzstd +Requires: lz4 +Requires: openldap +Requires: pam +Requires: pcre2 +Requires: perl +Requires: readline +Requires: xz +%endif + +%description + +Apache Cloudberry (Incubating) is an advanced, open-source, massively +parallel processing (MPP) data warehouse developed from PostgreSQL and +Greenplum. It is designed for high-performance analytics on +large-scale data sets, offering powerful analytical capabilities and +enhanced security features. + +Key Features: + +- Massively parallel processing for optimized performance +- Advanced analytics for complex data processing +- Integration with ETL and BI tools +- Compatibility with multiple data sources and formats +- Enhanced security features + +Apache Cloudberry supports both batch processing and real-time data +warehousing, making it a versatile solution for modern data +environments. + +Apache Cloudberry (Incubating) is an effort undergoing incubation at +the Apache Software Foundation (ASF), sponsored by the Apache +Incubator PMC. + +Incubation is required of all newly accepted projects until a further +review indicates that the infrastructure, communications, and decision +making process have stabilized in a manner consistent with other +successful ASF projects. 
+ +While incubation status is not necessarily a reflection of the +completeness or stability of the code, it does indicate that the +project has yet to be fully endorsed by the ASF. + +%prep +# No prep needed for binary RPM + +%build +# No prep needed for binary RPM + +%install +rm -rf %{buildroot} + +# Create the versioned directory +mkdir -p %{buildroot}%{cloudberry_install_dir}-%{version} + +cp -R %{cloudberry_install_dir}/* %{buildroot}%{cloudberry_install_dir}-%{version} + +# Create the symbolic link +ln -sfn %{cloudberry_install_dir}-%{version} %{buildroot}%{cloudberry_install_dir} + +%files +%{prefix}-%{version} +%{prefix} + +%license %{cloudberry_install_dir}-%{version}/LICENSE + +%debug_package + +%post +# Change ownership to gpadmin.gpadmin if the gpadmin user exists +if id "gpadmin" &>/dev/null; then + chown -R gpadmin:gpadmin %{cloudberry_install_dir}-%{version} + chown gpadmin:gpadmin %{cloudberry_install_dir} +fi + +%postun +if [ $1 -eq 0 ] ; then + if [ "$(readlink -f "%{cloudberry_install_dir}")" == "%{cloudberry_install_dir}-%{version}" ]; then + unlink "%{cloudberry_install_dir}" || true + fi +fi diff --git a/devops/build/packaging/rpm/apache-cloudberry-hll-incubating.spec b/devops/build/packaging/rpm/apache-cloudberry-hll-incubating.spec new file mode 100644 index 00000000000..4d2d6126f74 --- /dev/null +++ b/devops/build/packaging/rpm/apache-cloudberry-hll-incubating.spec @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +%global cloudberry_version %{?_cloudberry_version}%{!?_cloudberry_version:1.6} +%global cloudberry_install_dir /usr/local/cloudberry-db + +Name: apache-cloudberry-hll-incubating +Version: 2.18.0 +Release: %{?release}%{!?release:1} +Summary: HyperLogLog extension for Apache Cloudberry %{cloudberry_version} +License: ASL 2.0 +URL: https://github.com/citusdata/postgresql-hll +Vendor: Apache Cloudberry (Incubating) +Group: Applications/Databases +BuildArch: x86_64 +Requires: apache-cloudberry-db-incubating >= %{cloudberry_version} +Prefix: %{cloudberry_install_dir} + +%description +HLL is an open-source PostgreSQL extension (compatible with Apache +Cloudberry (Incubating) %{cloudberry_version}) adding HyperLogLog data +structures as a native data type. HyperLogLog is a fixed-size, +set-like structure used for distinct value counting with tunable +precision. 
+ +%prep +# No prep needed for binary RPM + +%build +# No build needed for binary RPM + +%install +mkdir -p %{buildroot}%{prefix}/lib/postgresql \ + %{buildroot}%{prefix}/share/postgresql/extension + +cp -R %{cloudberry_install_dir}/lib/postgresql/hll.so \ + %{buildroot}%{prefix}/lib/postgresql/hll.so + +cp -R %{cloudberry_install_dir}/share/postgresql/extension/hll* \ + %{buildroot}%{prefix}/share/postgresql/extension + +%files +%{prefix}/lib/postgresql/hll.so +%{prefix}/share/postgresql/extension/hll--*.sql +%{prefix}/share/postgresql/extension/hll.control + +%post +echo "HLL extension for Apache Cloudberry %{cloudberry_version} has been installed in %{prefix}." +echo "To enable it in a database, run:" +echo " CREATE EXTENSION hll;" + +%postun +echo "HLL extension for Apache Cloudberry %{cloudberry_version} has been removed from %{prefix}." +echo "You may need to manually clean up any database objects that were using the extension." diff --git a/devops/build/packaging/rpm/apache-cloudberry-pgvector-incubating.spec b/devops/build/packaging/rpm/apache-cloudberry-pgvector-incubating.spec new file mode 100644 index 00000000000..0be44308e07 --- /dev/null +++ b/devops/build/packaging/rpm/apache-cloudberry-pgvector-incubating.spec @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +%global cloudberry_version %{?_cloudberry_version}%{!?_cloudberry_version:1.6} +%global cloudberry_install_dir /usr/local/cloudberry-db +%global pgvector_version %{?_pgvector_version}%{!?_pgvector_version:0.5.1} + +Name: cloudberry-pgvector +Version: %{pgvector_version} +Release: %{?release}%{!?release:1} +Summary: pgvector extension for Apache Cloudberry %{cloudberry_version} +License: PostgreSQL +URL: https://github.com/pgvector/pgvector +Vendor: Cloudberry Open Source +Group: Applications/Databases +BuildArch: x86_64 +Requires: cloudberry-db >= %{cloudberry_version} +Prefix: %{cloudberry_install_dir} + +%description +pgvector is an open-source vector similarity search extension for +PostgreSQL and Apache Cloudberry %{cloudberry_version}. It provides +vector data types and vector similarity search functions, allowing for +efficient similarity search operations on high-dimensional data. 
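A minimal sketch of enabling the extension and running a nearest-neighbour query once this RPM is installed on a running cluster; the database and table names are illustrative:

```bash
psql -d gpadmin <<'SQL'
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE items (id bigint, embedding vector(3));
INSERT INTO items VALUES (1, '[1,2,3]'), (2, '[4,5,6]');
-- <-> is pgvector's Euclidean (L2) distance operator.
SELECT id FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 1;
SQL
```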
+ +%prep +# No prep needed for binary RPM + +%build +# No build needed for binary RPM + +%install +mkdir -p %{buildroot}%{prefix}/include/postgresql/server/extension/vector \ + %{buildroot}%{prefix}/lib/postgresql \ + %{buildroot}%{prefix}/share/postgresql/extension +cp -R %{cloudberry_install_dir}/include/postgresql/server/extension/vector/* \ + %{buildroot}%{prefix}/include/postgresql/server/extension/vector +cp -R %{cloudberry_install_dir}/lib/postgresql/vector.so \ + %{buildroot}%{prefix}/lib/postgresql/vector.so +cp -R %{cloudberry_install_dir}/share/postgresql/extension/vector* \ + %{buildroot}%{prefix}/share/postgresql/extension + +%files +%{prefix}/include/postgresql/server/extension/vector/* +%{prefix}/lib/postgresql/vector.so +%{prefix}/share/postgresql/extension/vector--*.sql +%{prefix}/share/postgresql/extension/vector.control + +%post +echo "pgvector extension version %{version} for Apache Cloudberry %{cloudberry_version} has been installed in %{prefix}." +echo "To enable it in a database, run:" +echo " CREATE EXTENSION vector;" + +%postun +echo "pgvector extension version %{version} for Apache Cloudberry %{cloudberry_version} has been removed from %{prefix}." +echo "You may need to manually clean up any database objects that were using the extension." diff --git a/devops/build/packaging/rpm/build-rpm.sh b/devops/build/packaging/rpm/build-rpm.sh new file mode 100755 index 00000000000..ceb7d18d392 --- /dev/null +++ b/devops/build/packaging/rpm/build-rpm.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script Name: build-rpm.sh +# +# Description: +# This script automates the process of building an RPM package using a specified +# version and release number. It ensures that the necessary tools are installed +# and that the spec file exists before attempting to build the RPM. The script +# also includes error handling to provide meaningful feedback in case of failure. +# +# Usage: +# ./build-rpm.sh -v [-r ] [-d|--with-debug] [-h] [--dry-run] +# +# Options: +# -v, --version : Specify the version (required) +# -r, --release : Specify the release (optional, default is 1) +# -d, --with-debug : Build with debug symbols (optional) +# -h, --help : Display this help and exit +# -n, --dry-run : Show what would be done, without making any changes +# +# Example: +# ./build-rpm.sh -v 1.5.5 -r 2 # Build with version 1.5.5 and release 2 +# ./build-rpm.sh -v 1.5.5 # Build with version 1.5.5 and default release 1 +# ./build-rpm.sh -v 1.5.5 --with-debug # Build with debug symbols +# +# Prerequisites: +# - The rpm-build package must be installed (provides the rpmbuild command). +# - The spec file must exist at ~/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec. 
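A minimal sketch of staging the spec before invoking this script; the repository-relative paths and the version number are illustrative, and `rpmdev-setuptree` comes from the rpmdevtools package (a plain `mkdir -p ~/rpmbuild/SPECS` works as well):

```bash
rpmdev-setuptree   # creates the ~/rpmbuild/{SPECS,SOURCES,RPMS,...} tree
cp devops/build/packaging/rpm/apache-cloudberry-db-incubating.spec ~/rpmbuild/SPECS/
./devops/build/packaging/rpm/build-rpm.sh -v 2.0.0 -r 1
```

Note that the spec's %install stage packages an existing installation tree under /usr/local/cloudberry-db, so Cloudberry must already be built and installed there before the RPM is assembled.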
+# +# Error Handling: +# The script includes checks to ensure: +# - The version option (-v or --version) is provided. +# - The necessary commands are available. +# - The spec file exists at the specified location. +# If any of these checks fail, the script exits with an appropriate error message. + +# Enable strict mode for better error handling +set -euo pipefail + +# Default values +VERSION="" +RELEASE="1" +DEBUG_BUILD=false + +# Function to display usage information +usage() { + echo "Usage: $0 -v [-r ] [-h] [--dry-run]" + echo " -v, --version : Specify the version (required)" + echo " -r, --release : Specify the release (optional, default is 1)" + echo " -d, --with-debug : Build with debug symbols (optional)" + echo " -h, --help : Display this help and exit" + echo " -n, --dry-run : Show what would be done, without making any changes" + exit 1 +} + +# Function to check if required commands are available +check_commands() { + local cmds=("rpmbuild") + for cmd in "${cmds[@]}"; do + if ! command -v "$cmd" &> /dev/null; then + echo "Error: Required command '$cmd' not found. Please install it before running the script." + exit 1 + fi + done +} + +# Parse options +while [[ "$#" -gt 0 ]]; do + case $1 in + -v|--version) + VERSION="$2" + shift 2 + ;; + -r|--release) + RELEASE="$2" + shift 2 + ;; + -d|--with-debug) + DEBUG_BUILD=true + shift + ;; + -h|--help) + usage + ;; + -n|--dry-run) + DRY_RUN=true + shift + ;; + *) + echo "Unknown option: ($1)" + shift + ;; + esac +done + +# Ensure version is provided +if [ -z "$VERSION" ]; then + echo "Error: Version (-v or --version) is required." + usage +fi + +# Check if required commands are available +check_commands + +# Define the spec file path +SPEC_FILE=~/rpmbuild/SPECS/apache-cloudberry-db-incubating.spec + +# Check if the spec file exists +if [ ! -f "$SPEC_FILE" ]; then + echo "Error: Spec file not found at $SPEC_FILE." + exit 1 +fi + +# Build the rpmbuild command based on options +RPMBUILD_CMD="rpmbuild -bb \"$SPEC_FILE\" --define \"version $VERSION\" --define \"release $RELEASE\"" +if [ "$DEBUG_BUILD" = true ]; then + RPMBUILD_CMD+=" --with debug" +fi + +# Dry-run mode +if [ "${DRY_RUN:-false}" = true ]; then + echo "Dry-run mode: This is what would be done:" + echo " $RPMBUILD_CMD" + exit 0 +fi + +# Run rpmbuild with the provided options +echo "Building RPM with Version: $VERSION, Release: $RELEASE$([ "$DEBUG_BUILD" = true ] && echo ", Debug: enabled")..." +if ! eval "$RPMBUILD_CMD"; then + echo "Error: rpmbuild failed." + exit 1 +fi + +# Print completion message +echo "RPM build completed successfully with Version: $VERSION, Release: $RELEASE" diff --git a/devops/build/packaging/rpm/cloudberry-dev-repo.spec b/devops/build/packaging/rpm/cloudberry-dev-repo.spec new file mode 100644 index 00000000000..45f6186cf3d --- /dev/null +++ b/devops/build/packaging/rpm/cloudberry-dev-repo.spec @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +Name: cloudberry-dev-repo +Version: 1.0 +Release: 1%{?dist} +Summary: Apache Cloudberry Repository Configuration +License: ASL 2.0 +Group: Applications/Databases +URL: https://cloudberry.apache.org +Vendor: Cloudberry Open Source +BuildArch: noarch + +%description +This package configures the Apache Cloudberry repository on your +system. Apache Cloudberry is an open-source project aimed at +providing a scalable, high-performance SQL database for +analytics. This repository provides access to the latest RPM packages +for Apache Cloudberry, allowing you to easily install and stay +up-to-date with the latest developments. + +%install +mkdir -p %{buildroot}%{_sysconfdir}/yum.repos.d/ +cat > %{buildroot}%{_sysconfdir}/yum.repos.d/cloudberry-dev.repo < /dev/null + +# -------------------------------------------------------------------- +# Copy Configuration Files and Setup the Environment +# -------------------------------------------------------------------- +# - Copy custom configuration files from the build context to /tmp/. +# - Apply custom system limits and timezone. +# - Create and configure the 'gpadmin' user with sudo privileges. +# - Set up SSH for password-based authentication. +# - Generate locale and set the default locale to en_US.UTF-8. +# -------------------------------------------------------------------- +COPY ./configs/* /tmp/ + +RUN cp /tmp/90-cbdb-limits /etc/security/limits.d/90-cbdb-limits && \ + sed -i.bak -r 's/^(session\s+required\s+pam_limits.so)/#\1/' /etc/pam.d/* && \ + cat /usr/share/zoneinfo/${TIMEZONE_VAR} > /etc/localtime && \ + chmod 777 /tmp/init_system.sh && \ + /usr/sbin/groupadd gpadmin && \ + /usr/sbin/useradd gpadmin -g gpadmin -G wheel && \ + setcap cap_net_raw+ep /usr/bin/ping && \ + echo 'gpadmin ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/90-gpadmin && \ + echo -e '\n# Add Cloudberry entries\nif [ -f /usr/local/cbdb/cloudberry-env.sh ]; then\n source /usr/local/cbdb/cloudberry-env.sh\nfi' >> /home/gpadmin/.bashrc && \ + ssh-keygen -A && \ + echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + localedef -i en_US -f UTF-8 en_US.UTF-8 && \ + echo "LANG=en_US.UTF-8" | tee /etc/locale.conf && \ + dnf clean all # Final cleanup to remove unnecessary files + +# Install testinfra via pip +RUN pip3 install pytest-testinfra + +# Example: Copying test files into the container +COPY tests /tests + +# -------------------------------------------------------------------- +# Set the Default User and Command +# -------------------------------------------------------------------- +# The default user is set to 'gpadmin', and the container starts by +# running the init_system.sh script. The container also mounts the +# /sys/fs/cgroup volume for systemd compatibility. 
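A minimal sketch of building and starting this image; the tag is illustrative, and `-h cdw` is required because init_system.sh exits unless the container hostname is `cdw`:

```bash
docker build -t cloudberry-build-rocky8 devops/deploy/docker/build/rocky8
docker run -h cdw -it cloudberry-build-rocky8
```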
+# -------------------------------------------------------------------- +USER gpadmin + +VOLUME [ "/sys/fs/cgroup" ] +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/deploy/docker/build/rocky8/configs/90-cbdb-limits b/devops/deploy/docker/build/rocky8/configs/90-cbdb-limits new file mode 100644 index 00000000000..474957c42f6 --- /dev/null +++ b/devops/deploy/docker/build/rocky8/configs/90-cbdb-limits @@ -0,0 +1,32 @@ +# /etc/security/limits.d/90-db-limits +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +# Core dump file size limits for gpadmin +gpadmin soft core unlimited +gpadmin hard core unlimited + +# Open file limits for gpadmin +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 + +# Process limits for gpadmin +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 diff --git a/devops/deploy/docker/build/rocky8/configs/gpinitsystem.conf b/devops/deploy/docker/build/rocky8/configs/gpinitsystem.conf new file mode 100644 index 00000000000..3c0fb48b58c --- /dev/null +++ b/devops/deploy/docker/build/rocky8/configs/gpinitsystem.conf @@ -0,0 +1,87 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# gpinitsystem Configuration File for Apache Cloudberry +# -------------------------------------------------------------------- +# This configuration file is used to initialize an Apache Cloudberry +# cluster. It defines the settings for the coordinator, primary segments, +# and mirrors, as well as other important configuration options. +# -------------------------------------------------------------------- + +# Segment prefix - This prefix is used for naming the segment directories. +# For example, the primary segment directories will be named gpseg0, gpseg1, etc. 
+SEG_PREFIX=gpseg + +# Coordinator port - The port number where the coordinator will listen. +# This is the port used by clients to connect to the database. +COORDINATOR_PORT=5432 + +# Coordinator hostname - The hostname of the machine where the coordinator +# will be running. The $(hostname) command will automatically insert the +# hostname of the current machine. +COORDINATOR_HOSTNAME=$(hostname) + +# Coordinator data directory - The directory where the coordinator's data +# will be stored. This directory should have enough space to store metadata +# and system catalogs. +COORDINATOR_DIRECTORY=/data1/coordinator + +# Base port for primary segments - The starting port number for the primary +# segments. Each primary segment will use a unique port number starting from +# this base. +PORT_BASE=6000 + +# Primary segment data directories - An array specifying the directories where +# the primary segment data will be stored. Each directory corresponds to a +# primary segment. In this case, two primary segments will be created in the +# same directory. +declare -a DATA_DIRECTORY=(/data1/primary /data1/primary) + +# Base port for mirror segments - The starting port number for the mirror +# segments. Each mirror segment will use a unique port number starting from +# this base. +MIRROR_PORT_BASE=7000 + +# Mirror segment data directories - An array specifying the directories where +# the mirror segment data will be stored. Each directory corresponds to a +# mirror segment. In this case, two mirror segments will be created in the +# same directory. +declare -a MIRROR_DATA_DIRECTORY=(/data1/mirror /data1/mirror) + +# Trusted shell - The shell program used for remote execution. Cloudberry uses +# SSH to run commands on other machines in the cluster. 'ssh' is the default. +TRUSTED_SHELL=ssh + +# Database encoding - The character set encoding to be used by the database. +# 'UNICODE' is a common choice, especially for internationalization. +ENCODING=UNICODE + +# Default database name - The name of the default database to be created during +# initialization. This is also the default database that the gpadmin user will +# connect to. +DATABASE_NAME=gpadmin + +# Machine list file - A file containing the list of hostnames where the primary +# segments will be created. Each line in the file represents a different machine. +# This file is critical for setting up the cluster across multiple nodes. +MACHINE_LIST_FILE=/home/gpadmin/hostfile_gpinitsystem + +# -------------------------------------------------------------------- +# End of gpinitsystem Configuration File +# -------------------------------------------------------------------- diff --git a/devops/deploy/docker/build/rocky8/configs/init_system.sh b/devops/deploy/docker/build/rocky8/configs/init_system.sh new file mode 100755 index 00000000000..cc2d5991b9d --- /dev/null +++ b/devops/deploy/docker/build/rocky8/configs/init_system.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# -------------------------------------------------------------------- +# Container Initialization Script +# -------------------------------------------------------------------- +# This script sets up the environment inside the Docker container for +# the Apache Cloudberry Build Environment. It performs the following +# tasks: +# +# 1. Verifies that the container is running with the expected hostname. +# 2. Starts the SSH daemon to allow SSH access to the container. +# 3. Configures passwordless SSH access for the 'gpadmin' user. +# 4. Displays a welcome banner and system information. +# 5. Starts an interactive bash shell. +# +# This script is intended to be used as an entrypoint or initialization +# script for the Docker container. +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Check if the hostname is 'cdw' +# -------------------------------------------------------------------- +# The script checks if the container's hostname is set to 'cdw'. This is +# a requirement for this environment, and if the hostname does not match, +# the script will exit with an error message. This ensures consistency +# across different environments. +# -------------------------------------------------------------------- +if [ "$(hostname)" != "cdw" ]; then + echo "Error: This container must be run with the hostname 'cdw'." + echo "Use the following command: docker run -h cdw ..." + exit 1 +fi + +# -------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# -------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. If the SSH +# daemon fails to start, the script exits with an error. +# -------------------------------------------------------------------- +if ! sudo /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# -------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# -------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# -------------------------------------------------------------------- +sudo rm -rf /run/nologin + +# -------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# -------------------------------------------------------------------- +# The script sets up SSH key-based authentication for the 'gpadmin' user, +# allowing passwordless SSH access. It generates a new SSH key pair if one +# does not already exist, and configures the necessary permissions. +# -------------------------------------------------------------------- +mkdir -p /home/gpadmin/.ssh +chmod 700 /home/gpadmin/.ssh + +if [ ! 
-f /home/gpadmin/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -C gpadmin -f /home/gpadmin/.ssh/id_rsa -P "" > /dev/null 2>&1 +fi + +cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys +chmod 600 /home/gpadmin/.ssh/authorized_keys + +# Add the container's hostname to the known_hosts file to avoid SSH warnings +ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null + +# Change to the home directory of the current user +cd $HOME + +# -------------------------------------------------------------------- +# Display a Welcome Banner +# -------------------------------------------------------------------- +# The following ASCII art and welcome message are displayed when the +# container starts. This banner provides a visual indication that the +# container is running in the Apache Cloudberry Build Environment. +# -------------------------------------------------------------------- +cat <<-'EOF' + +====================================================================== + + ++++++++++ ++++++ + ++++++++++++++ +++++++ + ++++ +++++ ++++ + ++++ +++++++++ + =+==== =============+ + ======== =====+ ===== + ==== ==== ==== ==== + ==== === === ==== + ==== === === ==== + ==== === ==-- === + ===== ===== -- ==== + ===================== ====== + ============================ + =-----= + ____ _ _ _ + / ___|| | ___ _ _ __| || |__ ___ _ __ _ __ _ _ + | | | | / _ \ | | | | / _` || '_ \ / _ \| '__|| '__|| | | | + | |___ | || (_) || |_| || (_| || |_) || __/| | | | | |_| | + \____||_| \____ \__,_| \__,_||_.__/ \___||_| |_| \__, | + |___/ +---------------------------------------------------------------------- + +EOF + +# -------------------------------------------------------------------- +# Display System Information +# -------------------------------------------------------------------- +# The script sources the /etc/os-release file to retrieve the operating +# system name and version. It then displays the following information: +# - OS name and version +# - Current user +# - Container hostname +# - IP address +# - CPU model name and number of cores +# - Total memory available +# This information is useful for users to understand the environment they +# are working in. +# -------------------------------------------------------------------- +source /etc/os-release + +# First, create the CPU info detection function +get_cpu_info() { + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + lscpu | grep 'Model name:' | awk '{print substr($0, index($0,$3))}' + elif [ "$ARCH" = "aarch64" ]; then + VENDOR=$(lscpu | grep 'Vendor ID:' | awk '{print $3}') + if [ "$VENDOR" = "Apple" ] || [ "$VENDOR" = "0x61" ]; then + echo "Apple Silicon ($ARCH)" + else + if [ -f /proc/cpuinfo ]; then + IMPL=$(grep "CPU implementer" /proc/cpuinfo | head -1 | awk '{print $3}') + PART=$(grep "CPU part" /proc/cpuinfo | head -1 | awk '{print $3}') + if [ ! -z "$IMPL" ] && [ ! -z "$PART" ]; then + echo "ARM $ARCH (Implementer: $IMPL, Part: $PART)" + else + echo "ARM $ARCH" + fi + else + echo "ARM $ARCH" + fi + fi + else + echo "Unknown architecture: $ARCH" + fi +} + +cat <<-EOF +Welcome to the Apache Cloudberry Build Environment! + +Container OS ........ : $NAME $VERSION +User ................ : $(whoami) +Container hostname .. : $(hostname) +IP Address .......... : $(hostname -I | awk '{print $1}') +CPU Info ............ : $(get_cpu_info) +CPU(s) .............. : $(nproc) +Memory .............. 
: $(free -h | grep Mem: | awk '{print $2}') total +====================================================================== + +EOF + +# -------------------------------------------------------------------- +# Start an interactive bash shell +# -------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. +# -------------------------------------------------------------------- +/bin/bash diff --git a/devops/deploy/docker/build/rocky8/tests/requirements.txt b/devops/deploy/docker/build/rocky8/tests/requirements.txt new file mode 100644 index 00000000000..b9711eddac5 --- /dev/null +++ b/devops/deploy/docker/build/rocky8/tests/requirements.txt @@ -0,0 +1,3 @@ +testinfra +pytest-testinfra +paramiko diff --git a/devops/deploy/docker/build/rocky8/tests/testinfra/test_cloudberry_db_env.py b/devops/deploy/docker/build/rocky8/tests/testinfra/test_cloudberry_db_env.py new file mode 100644 index 00000000000..c484c5b9408 --- /dev/null +++ b/devops/deploy/docker/build/rocky8/tests/testinfra/test_cloudberry_db_env.py @@ -0,0 +1,126 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +import testinfra + +def test_installed_packages(host): + """ + Test if the essential packages are installed. + """ + packages = [ + "epel-release", + "git", + "the_silver_searcher", + "htop", + "bison", + "gcc", + "gcc-c++", + "glibc-langpack-en", + "glibc-locale-source", + "openssh-clients", + "openssh-server", + "sudo", + "rsync", + "wget", + "openssl-devel", + "python36-devel", + "readline-devel", + "zlib-devel", + "libcurl-devel", + "libevent-devel", + "libxml2-devel", + "libuuid-devel", + "libzstd-devel", + "lz4", + "openldap-devel", + "libuv-devel", + "libyaml-devel" + ] + for package in packages: + pkg = host.package(package) + assert pkg.is_installed + + +def test_user_gpadmin_exists(host): + """ + Test if the gpadmin user exists and is configured properly. + """ + user = host.user("gpadmin") + assert user.exists + assert "wheel" in user.groups + + +def test_ssh_service(host): + """ + Test if SSH service is configured correctly. + """ + sshd_config = host.file("/etc/ssh/sshd_config") + assert sshd_config.exists + + +def test_locale_configured(host): + """ + Test if the locale is configured correctly. + """ + locale_conf = host.file("/etc/locale.conf") + assert locale_conf.exists + assert locale_conf.contains("LANG=en_US.UTF-8") + + +def test_timezone(host): + """ + Test if the timezone is configured correctly. 
+ """ + localtime = host.file("/etc/localtime") + assert localtime.exists + + +def test_system_limits_configured(host): + """ + Test if the custom system limits are applied. + """ + limits_file = host.file("/etc/security/limits.d/90-cbdb-limits") + assert limits_file.exists + + +def test_init_system_script(host): + """ + Test if the init_system.sh script is present and executable. + """ + script = host.file("/tmp/init_system.sh") + assert script.exists + assert script.mode == 0o777 + + +def test_custom_configuration_files(host): + """ + Test if custom configuration files are correctly copied. + """ + config_file = host.file("/tmp/90-cbdb-limits") + assert config_file.exists + + +def test_locale_generated(host): + """ + Test if the en_US.UTF-8 locale is correctly generated. + """ + locale = host.run("locale -a | grep en_US.utf8") + assert locale.exit_status == 0 + assert "en_US.utf8" in locale.stdout diff --git a/devops/deploy/docker/build/rocky9/Dockerfile b/devops/deploy/docker/build/rocky9/Dockerfile new file mode 100644 index 00000000000..26190109ef0 --- /dev/null +++ b/devops/deploy/docker/build/rocky9/Dockerfile @@ -0,0 +1,218 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Apache Cloudberry (Incubating) is an effort undergoing incubation at +# the Apache Software Foundation (ASF), sponsored by the Apache +# Incubator PMC. +# +# Incubation is required of all newly accepted projects until a +# further review indicates that the infrastructure, communications, +# and decision making process have stabilized in a manner consistent +# with other successful ASF projects. +# +# While incubation status is not necessarily a reflection of the +# completeness or stability of the code, it does indicate that the +# project has yet to be fully endorsed by the ASF. +# +# -------------------------------------------------------------------- +# Dockerfile for Apache Cloudberry Build Environment +# -------------------------------------------------------------------- +# This Dockerfile sets up a Rocky Linux 9-based container for building +# and developing Apache Cloudberry. It installs necessary system +# utilities, development tools, and configures the environment for SSH +# access and systemd support. +# +# Key Features: +# - Locale setup for en_US.UTF-8 +# - SSH daemon setup for remote access +# - Essential development tools and libraries installation +# - User configuration for 'gpadmin' with sudo privileges +# +# Usage: +# docker build -t cloudberry-db-env . 
+# docker run -h cdw -it cloudberry-db-env +# -------------------------------------------------------------------- + +# Base image: Rocky Linux 9 +FROM rockylinux/rockylinux:9 + +# Argument for configuring the timezone +ARG TIMEZONE_VAR="America/Los_Angeles" + +# Environment variables for locale and user +ENV container=docker +ENV LANG=en_US.UTF-8 +ENV USER=gpadmin + +# -------------------------------------------------------------------- +# Install Development Tools and Utilities +# -------------------------------------------------------------------- +# Install various development tools, system utilities, and libraries +# required for building and running Apache Cloudberry. +# - EPEL repository is enabled for additional packages. +# - Cleanup steps are added to reduce image size after installation. +# -------------------------------------------------------------------- +RUN dnf makecache && \ + dnf install -y \ + epel-release \ + rocky-release-hpc \ + git && \ + dnf config-manager --disable epel-cisco-openh264 && \ + dnf makecache && \ + dnf config-manager --disable epel && \ + dnf install -y --enablerepo=epel \ + the_silver_searcher \ + bat \ + htop && \ + dnf install -y \ + bison \ + cmake3 \ + ed \ + file \ + flex \ + gcc \ + gcc-c++ \ + gdb \ + glibc-langpack-en \ + glibc-locale-source \ + initscripts \ + iproute \ + less \ + lsof \ + m4 \ + net-tools \ + openssh-clients \ + openssh-server \ + perl \ + rpm-build \ + rpmdevtools \ + rsync \ + sudo \ + tar \ + unzip \ + util-linux-ng \ + wget \ + sshpass \ + which && \ + dnf install -y \ + apr-devel \ + bzip2-devel \ + java-11-openjdk \ + java-11-openjdk-devel \ + krb5-devel \ + libcurl-devel \ + libssh2-devel \ + libevent-devel \ + libxml2-devel \ + libuuid-devel \ + libzstd-devel \ + lz4 \ + lz4-devel \ + openldap-devel \ + openssl-devel \ + pam-devel \ + perl-ExtUtils-Embed \ + perl-Test-Simple \ + perl-core \ + python3-devel \ + python3-pytest \ + readline-devel \ + zlib-devel && \ + dnf install -y --enablerepo=crb \ + liburing-devel \ + libuv-devel \ + libyaml-devel \ + perl-IPC-Run \ + protobuf-devel && \ + dnf clean all && \ + cd && XERCES_LATEST_RELEASE=3.3.0 && \ + wget -nv "https://archive.apache.org/dist/xerces/c/3/sources/xerces-c-${XERCES_LATEST_RELEASE}.tar.gz" && \ + echo "$(curl -sL https://archive.apache.org/dist/xerces/c/3/sources/xerces-c-${XERCES_LATEST_RELEASE}.tar.gz.sha256)" | sha256sum -c - && \ + tar xf "xerces-c-${XERCES_LATEST_RELEASE}.tar.gz"; rm "xerces-c-${XERCES_LATEST_RELEASE}.tar.gz" && \ + cd xerces-c-${XERCES_LATEST_RELEASE} && \ + ./configure --prefix=/usr/local/xerces-c && \ + make -j$(nproc) && \ + make install -C ~/xerces-c-${XERCES_LATEST_RELEASE} && \ + rm -rf ~/xerces-c* && \ + cd && GO_VERSION="go1.23.4" && \ + ARCH=$(uname -m) && \ + if [ "${ARCH}" = "aarch64" ]; then \ + GO_ARCH="arm64" && \ + GO_SHA256="16e5017863a7f6071363782b1b8042eb12c6ca4f4cd71528b2123f0a1275b13e"; \ + elif [ "${ARCH}" = "x86_64" ]; then \ + GO_ARCH="amd64" && \ + GO_SHA256="6924efde5de86fe277676e929dc9917d466efa02fb934197bc2eba35d5680971"; \ + else \ + echo "Unsupported architecture: ${ARCH}" && exit 1; \ + fi && \ + GO_URL="https://go.dev/dl/${GO_VERSION}.linux-${GO_ARCH}.tar.gz" && \ + wget -nv "${GO_URL}" && \ + echo "${GO_SHA256} ${GO_VERSION}.linux-${GO_ARCH}.tar.gz" | sha256sum -c - && \ + tar xf "${GO_VERSION}.linux-${GO_ARCH}.tar.gz" && \ + mv go "/usr/local/${GO_VERSION}" && \ + ln -s "/usr/local/${GO_VERSION}" /usr/local/go && \ + rm -f "${GO_VERSION}.linux-${GO_ARCH}.tar.gz" && \ + echo 'export 
PATH=$PATH:/usr/local/go/bin' | tee -a /etc/profile.d/go.sh > /dev/null + +# -------------------------------------------------------------------- +# Copy Configuration Files and Setup the Environment +# -------------------------------------------------------------------- +# - Copy custom configuration files from the build context to /tmp/. +# - Apply custom system limits and timezone. +# - Create and configure the 'gpadmin' user with sudo privileges. +# - Set up SSH for password-based authentication. +# - Generate locale and set the default locale to en_US.UTF-8. +# -------------------------------------------------------------------- + +# Copy configuration files from their respective locations +COPY ./configs/* /tmp/ + +RUN cp /tmp/90-cbdb-limits /etc/security/limits.d/90-cbdb-limits && \ + sed -i.bak -r 's/^(session\s+required\s+pam_limits.so)/#\1/' /etc/pam.d/* && \ + cat /usr/share/zoneinfo/${TIMEZONE_VAR} > /etc/localtime && \ + chmod 777 /tmp/init_system.sh && \ + /usr/sbin/groupadd gpadmin && \ + /usr/sbin/useradd gpadmin -g gpadmin -G wheel && \ + setcap cap_net_raw+ep /usr/bin/ping && \ + echo 'gpadmin ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/90-gpadmin && \ + echo -e '\n# Add Cloudberry entries\nif [ -f /usr/local/cbdb/cloudberry-env.sh ]; then\n source /usr/local/cbdb/cloudberry-env.sh\nfi' >> /home/gpadmin/.bashrc && \ + ssh-keygen -A && \ + echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + localedef -i en_US -f UTF-8 en_US.UTF-8 && \ + echo "LANG=en_US.UTF-8" | tee /etc/locale.conf && \ + dnf clean all # Final cleanup to remove unnecessary files + +# Install testinfra via pip +RUN pip3 install pytest-testinfra + +# Copying test files into the container +COPY ./tests /tests + +# -------------------------------------------------------------------- +# Set the Default User and Command +# -------------------------------------------------------------------- +# The default user is set to 'gpadmin', and the container starts by +# running the init_system.sh script. The container also mounts the +# /sys/fs/cgroup volume for systemd compatibility. +# -------------------------------------------------------------------- +USER gpadmin + +VOLUME [ "/sys/fs/cgroup" ] +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/deploy/docker/build/rocky9/configs/90-cbdb-limits b/devops/deploy/docker/build/rocky9/configs/90-cbdb-limits new file mode 100644 index 00000000000..474957c42f6 --- /dev/null +++ b/devops/deploy/docker/build/rocky9/configs/90-cbdb-limits @@ -0,0 +1,32 @@ +# /etc/security/limits.d/90-db-limits +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# -------------------------------------------------------------------- + +# Core dump file size limits for gpadmin +gpadmin soft core unlimited +gpadmin hard core unlimited + +# Open file limits for gpadmin +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 + +# Process limits for gpadmin +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 diff --git a/devops/deploy/docker/build/rocky9/configs/gpinitsystem.conf b/devops/deploy/docker/build/rocky9/configs/gpinitsystem.conf new file mode 100644 index 00000000000..d4d312231c5 --- /dev/null +++ b/devops/deploy/docker/build/rocky9/configs/gpinitsystem.conf @@ -0,0 +1,89 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# gpinitsystem Configuration File for Apache Cloudberry +# -------------------------------------------------------------------- +# This configuration file is used to initialize an Apache Cloudberry +# cluster. It defines the settings for the coordinator, primary segments, +# and mirrors, as well as other important configuration options. +# -------------------------------------------------------------------- + +# Segment prefix - This prefix is used for naming the segment directories. +# For example, the primary segment directories will be named gpseg0, gpseg1, etc. +SEG_PREFIX=gpseg + +# Coordinator port - The port number where the coordinator will listen. +# This is the port used by clients to connect to the database. +COORDINATOR_PORT=5432 + +# Coordinator hostname - The hostname of the machine where the coordinator +# will be running. The $(hostname) command will automatically insert the +# hostname of the current machine. +COORDINATOR_HOSTNAME=$(hostname) + +# Coordinator data directory - The directory where the coordinator's data +# will be stored. This directory should have enough space to store metadata +# and system catalogs. +COORDINATOR_DIRECTORY=/data1/coordinator + +# Base port for primary segments - The starting port number for the primary +# segments. Each primary segment will use a unique port number starting from +# this base. +PORT_BASE=6000 + +# Primary segment data directories - An array specifying the directories where +# the primary segment data will be stored. Each directory corresponds to a +# primary segment. In this case, two primary segments will be created in the +# same directory. +declare -a DATA_DIRECTORY=(/data1/primary /data1/primary) + +# Base port for mirror segments - The starting port number for the mirror +# segments. Each mirror segment will use a unique port number starting from +# this base. 
+MIRROR_PORT_BASE=7000 + +# Mirror segment data directories - An array specifying the directories where +# the mirror segment data will be stored. Each directory corresponds to a +# mirror segment. In this case, two mirror segments will be created in the +# same directory. +declare -a MIRROR_DATA_DIRECTORY=(/data1/mirror /data1/mirror) + +# Trusted shell - The shell program used for remote execution. Cloudberry uses +# SSH to run commands on other machines in the cluster. 'ssh' is the default. +TRUSTED_SHELL=ssh + +# Database encoding - The character set encoding to be used by the database. +# 'UNICODE' is a common choice, especially for internationalization. +ENCODING=UNICODE + +# Default database name - The name of the default database to be created during +# initialization. This is also the default database that the gpadmin user will +# connect to. +DATABASE_NAME=gpadmin + +# Machine list file - A file containing the list of hostnames where the primary +# segments will be created. Each line in the file represents a different machine. +# This file is critical for setting up the cluster across multiple nodes. +MACHINE_LIST_FILE=/home/gpadmin/hostfile_gpinitsystem + +# -------------------------------------------------------------------- +# End of gpinitsystem Configuration File +# -------------------------------------------------------------------- diff --git a/devops/deploy/docker/build/rocky9/configs/init_system.sh b/devops/deploy/docker/build/rocky9/configs/init_system.sh new file mode 100755 index 00000000000..d8c4a00b035 --- /dev/null +++ b/devops/deploy/docker/build/rocky9/configs/init_system.sh @@ -0,0 +1,192 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +## Container Initialization Script +# -------------------------------------------------------------------- +## This script sets up the environment inside the Docker container for +## the Apache Cloudberry Build Environment. It performs the following +## tasks: +## +## 1. Verifies that the container is running with the expected hostname. +## 2. Starts the SSH daemon to allow SSH access to the container. +## 3. Configures passwordless SSH access for the 'gpadmin' user. +## 4. Displays a welcome banner and system information. +## 5. Starts an interactive bash shell. +## +## This script is intended to be used as an entrypoint or initialization +## script for the Docker container. 
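Since this script is the image's default command and the image also bundles the testinfra suite under /tests, one way to exercise both is sketched below; the image tag and container name are illustrative, and a Docker CLI on the host is assumed:

```bash
docker build -t cloudberry-build-rocky9 devops/deploy/docker/build/rocky9
docker run -h cdw -dit --name cbdb-build-rocky9 cloudberry-build-rocky9

# The testinfra checks default to the local backend, so they can be run
# directly inside the container.
docker exec cbdb-build-rocky9 python3 -m pytest -v /tests/testinfra/test_cloudberry_db_env.py
```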
+# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Check if the hostname is 'cdw' +# -------------------------------------------------------------------- +# The script checks if the container's hostname is set to 'cdw'. This is +# a requirement for this environment, and if the hostname does not match, +# the script will exit with an error message. This ensures consistency +# across different environments. +# -------------------------------------------------------------------- +if [ "$(hostname)" != "cdw" ]; then + echo "Error: This container must be run with the hostname 'cdw'." + echo "Use the following command: docker run -h cdw ..." + exit 1 +fi + +# -------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# -------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. If the SSH +# daemon fails to start, the script exits with an error. +# -------------------------------------------------------------------- +if ! sudo /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# -------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# -------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# -------------------------------------------------------------------- +sudo rm -rf /run/nologin + +# -------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# -------------------------------------------------------------------- +# The script sets up SSH key-based authentication for the 'gpadmin' user, +# allowing passwordless SSH access. It generates a new SSH key pair if one +# does not already exist, and configures the necessary permissions. +# -------------------------------------------------------------------- +mkdir -p /home/gpadmin/.ssh +chmod 700 /home/gpadmin/.ssh + +if [ ! -f /home/gpadmin/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -C gpadmin -f /home/gpadmin/.ssh/id_rsa -P "" > /dev/null 2>&1 +fi + +cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys +chmod 600 /home/gpadmin/.ssh/authorized_keys + +# Add the container's hostname to the known_hosts file to avoid SSH warnings +ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null + +# Change to the home directory of the current user +cd $HOME + +# -------------------------------------------------------------------- +# Display a Welcome Banner +# -------------------------------------------------------------------- +# The following ASCII art and welcome message are displayed when the +# container starts. This banner provides a visual indication that the +# container is running in the Apache Cloudberry Build Environment. 
+# -------------------------------------------------------------------- +cat <<-'EOF' + +====================================================================== + + ++++++++++ ++++++ + ++++++++++++++ +++++++ + ++++ +++++ ++++ + ++++ +++++++++ + =+==== =============+ + ======== =====+ ===== + ==== ==== ==== ==== + ==== === === ==== + ==== === === ==== + ==== === ==-- === + ===== ===== -- ==== + ===================== ====== + ============================ + =-----= + ____ _ _ _ + / ___|| | ___ _ _ __| || |__ ___ _ __ _ __ _ _ + | | | | / _ \ | | | | / _` || '_ \ / _ \| '__|| '__|| | | | + | |___ | || (_) || |_| || (_| || |_) || __/| | | | | |_| | + \____||_| \____ \__,_| \__,_||_.__/ \___||_| |_| \__, | + |___/ +---------------------------------------------------------------------- + +EOF + +# -------------------------------------------------------------------- +# Display System Information +# -------------------------------------------------------------------- +# The script sources the /etc/os-release file to retrieve the operating +# system name and version. It then displays the following information: +# - OS name and version +# - Current user +# - Container hostname +# - IP address +# - CPU model name and number of cores +# - Total memory available +# This information is useful for users to understand the environment they +# are working in. +# -------------------------------------------------------------------- +source /etc/os-release + +# First, create the CPU info detection function +get_cpu_info() { + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + lscpu | grep 'Model name:' | awk '{print substr($0, index($0,$3))}' + elif [ "$ARCH" = "aarch64" ]; then + VENDOR=$(lscpu | grep 'Vendor ID:' | awk '{print $3}') + if [ "$VENDOR" = "Apple" ] || [ "$VENDOR" = "0x61" ]; then + echo "Apple Silicon ($ARCH)" + else + if [ -f /proc/cpuinfo ]; then + IMPL=$(grep "CPU implementer" /proc/cpuinfo | head -1 | awk '{print $3}') + PART=$(grep "CPU part" /proc/cpuinfo | head -1 | awk '{print $3}') + if [ ! -z "$IMPL" ] && [ ! -z "$PART" ]; then + echo "ARM $ARCH (Implementer: $IMPL, Part: $PART)" + else + echo "ARM $ARCH" + fi + else + echo "ARM $ARCH" + fi + fi + else + echo "Unknown architecture: $ARCH" + fi +} + +cat <<-EOF +Welcome to the Apache Cloudberry Build Environment! + +Container OS ........ : $NAME $VERSION +User ................ : $(whoami) +Container hostname .. : $(hostname) +IP Address .......... : $(hostname -I | awk '{print $1}') +CPU Info ............ : $(get_cpu_info) +CPU(s) .............. : $(nproc) +Memory .............. : $(free -h | grep Mem: | awk '{print $2}') total +====================================================================== + +EOF + +# -------------------------------------------------------------------- +# Start an interactive bash shell +# -------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. 
+# -------------------------------------------------------------------- +/bin/bash diff --git a/devops/deploy/docker/build/rocky9/tests/requirements.txt b/devops/deploy/docker/build/rocky9/tests/requirements.txt new file mode 100644 index 00000000000..b9711eddac5 --- /dev/null +++ b/devops/deploy/docker/build/rocky9/tests/requirements.txt @@ -0,0 +1,3 @@ +testinfra +pytest-testinfra +paramiko diff --git a/devops/deploy/docker/build/rocky9/tests/testinfra/test_cloudberry_db_env.py b/devops/deploy/docker/build/rocky9/tests/testinfra/test_cloudberry_db_env.py new file mode 100644 index 00000000000..9da7929ff98 --- /dev/null +++ b/devops/deploy/docker/build/rocky9/tests/testinfra/test_cloudberry_db_env.py @@ -0,0 +1,129 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +import testinfra + +def test_installed_packages(host): + """ + Test if the essential packages are installed. + """ + packages = [ + "epel-release", + "git", + "the_silver_searcher", + "bat", + "htop", + "bison", + "cmake", + "gcc", + "gcc-c++", + "glibc-langpack-en", + "glibc-locale-source", + "openssh-clients", + "openssh-server", + "sudo", + "rsync", + "wget", + "openssl-devel", + "python3-devel", + "python3-pytest", + "readline-devel", + "zlib-devel", + "libcurl-devel", + "libevent-devel", + "libxml2-devel", + "libuuid-devel", + "libzstd-devel", + "lz4", + "openldap-devel", + "libuv-devel", + "libyaml-devel" + ] + for package in packages: + pkg = host.package(package) + assert pkg.is_installed + + +def test_user_gpadmin_exists(host): + """ + Test if the gpadmin user exists and is configured properly. + """ + user = host.user("gpadmin") + assert user.exists + assert "wheel" in user.groups + + +def test_ssh_service(host): + """ + Test if SSH service is configured correctly. + """ + sshd_config = host.file("/etc/ssh/sshd_config") + assert sshd_config.exists + + +def test_locale_configured(host): + """ + Test if the locale is configured correctly. + """ + locale_conf = host.file("/etc/locale.conf") + assert locale_conf.exists + assert locale_conf.contains("LANG=en_US.UTF-8") + + +def test_timezone(host): + """ + Test if the timezone is configured correctly. + """ + localtime = host.file("/etc/localtime") + assert localtime.exists + + +def test_system_limits_configured(host): + """ + Test if the custom system limits are applied. + """ + limits_file = host.file("/etc/security/limits.d/90-cbdb-limits") + assert limits_file.exists + + +def test_init_system_script(host): + """ + Test if the init_system.sh script is present and executable. 
+ """ + script = host.file("/tmp/init_system.sh") + assert script.exists + assert script.mode == 0o777 + + +def test_custom_configuration_files(host): + """ + Test if custom configuration files are correctly copied. + """ + config_file = host.file("/tmp/90-cbdb-limits") + assert config_file.exists + + +def test_locale_generated(host): + """ + Test if the en_US.UTF-8 locale is correctly generated. + """ + locale = host.run("locale -a | grep en_US.utf8") + assert locale.exit_status == 0 + assert "en_US.utf8" in locale.stdout diff --git a/devops/deploy/docker/build/ubuntu22.04/Dockerfile b/devops/deploy/docker/build/ubuntu22.04/Dockerfile new file mode 100644 index 00000000000..3023a9fce67 --- /dev/null +++ b/devops/deploy/docker/build/ubuntu22.04/Dockerfile @@ -0,0 +1,199 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Apache Cloudberry (incubating) is an effort undergoing incubation at +# the Apache Software Foundation (ASF), sponsored by the Apache +# Incubator PMC. +# +# Incubation is required of all newly accepted projects until a +# further review indicates that the infrastructure, communications, +# and decision making process have stabilized in a manner consistent +# with other successful ASF projects. +# +# While incubation status is not necessarily a reflection of the +# completeness or stability of the code, it does indicate that the +# project has yet to be fully endorsed by the ASF. +# +# -------------------------------------------------------------------- +# Dockerfile for Apache Cloudberry Base Environment +# -------------------------------------------------------------------- +# This Dockerfile sets up a Ubuntu jammy 22.04 -based container to serve as +# a base environment for evaluating the Apache Cloudberry. It installs +# necessary system utilities, configures the environment for SSH access, +# and sets up a 'gpadmin' user with sudo privileges. The Apache Cloudberry +# DEB can be installed into this container for testing and +# functional verification. +# +# Key Features: +# - Locale setup for en_US.UTF-8 +# - SSH daemon setup for remote access +# - Essential system utilities installation +# - Separate user creation and configuration steps +# +# Security Considerations: +# - This Dockerfile prioritizes ease of use for functional testing and +# evaluation. It includes configurations such as passwordless sudo access +# for the 'gpadmin' user and SSH access with password authentication. +# - These configurations are suitable for testing and development but +# should NOT be used in a production environment due to potential security +# risks. 
+# +# Usage: +# docker build -t cloudberry-db-base-env . +# docker run -h cdw -it cloudberry-db-base-env +# -------------------------------------------------------------------- + +FROM ubuntu:22.04 + +# Argument for configuring the timezone +ARG TIMEZONE_VAR="Europe/London" + +# Environment variables for locale and user +ENV container=docker +ENV LANG=en_US.UTF-8 +ENV USER=gpadmin +ENV TZ=${TIMEZONE_VAR} +ENV DEBIAN_FRONTEND=noninteractive + +# -------------------------------------------------------------------- +# Install Development Tools and Utilities +# -------------------------------------------------------------------- + +RUN apt-get update && \ + apt-get install -y -qq \ + htop \ + bat \ + silversearcher-ag \ + vim \ + wget && \ + apt-get install -y -qq locales && \ + locale-gen "en_US.UTF-8" && \ + update-locale LANG="en_US.UTF-8" && \ + apt-get install -y -qq \ + bison \ + build-essential \ + cmake \ + dpkg-dev \ + fakeroot \ + flex \ + g++-11 \ + gcc-11 \ + gdb \ + git \ + iproute2 \ + iputils-ping \ + libapr1-dev \ + libbz2-dev \ + libcurl4-gnutls-dev \ + libevent-dev \ + libipc-run-perl \ + libkrb5-dev \ + libldap-dev \ + liblz4-dev \ + libpam0g-dev \ + libperl-dev \ + libprotobuf-dev \ + libreadline-dev \ + libssh2-1-dev \ + libssl-dev \ + liburing-dev \ + libuv1-dev \ + libxerces-c-dev \ + libxml2-dev \ + libyaml-dev \ + libzstd-dev \ + lsof \ + make \ + openssh-server \ + pkg-config \ + protobuf-compiler \ + python3-distutils \ + python3-pip \ + python3-setuptools \ + python3.10 \ + python3.10-dev \ + rsync \ + sudo \ + tzdata \ + zlib1g-dev && \ + apt-get install -y -qq \ + ca-certificates-java \ + cgroup-tools \ + curl \ + debhelper \ + libaprutil1-dev \ + libcgroup1 \ + ninja-build \ + quilt \ + unzip && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + cd && GO_VERSION="go1.23.4" && \ + ARCH=$(uname -m) && \ + if [ "${ARCH}" = "aarch64" ]; then \ + GO_ARCH="arm64" && \ + GO_SHA256="16e5017863a7f6071363782b1b8042eb12c6ca4f4cd71528b2123f0a1275b13e"; \ + elif [ "${ARCH}" = "x86_64" ]; then \ + GO_ARCH="amd64" && \ + GO_SHA256="6924efde5de86fe277676e929dc9917d466efa02fb934197bc2eba35d5680971"; \ + else \ + echo "Unsupported architecture: ${ARCH}" && exit 1; \ + fi && \ + GO_URL="https://go.dev/dl/${GO_VERSION}.linux-${GO_ARCH}.tar.gz" && \ + wget -nv "${GO_URL}" && \ + echo "${GO_SHA256} ${GO_VERSION}.linux-${GO_ARCH}.tar.gz" | sha256sum -c - && \ + tar xf "${GO_VERSION}.linux-${GO_ARCH}.tar.gz" && \ + mv go "/usr/local/${GO_VERSION}" && \ + ln -s "/usr/local/${GO_VERSION}" /usr/local/go && \ + rm -f "${GO_VERSION}.linux-${GO_ARCH}.tar.gz" && \ + echo 'export PATH=$PATH:/usr/local/go/bin' | tee -a /etc/profile.d/go.sh > /dev/null + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-11 100 && \ + update-alternatives --set gcc /usr/bin/gcc-11 && \ + update-alternatives --set g++ /usr/bin/g++-11 + +# -------------------------------------------------------------------- +# Copy Configuration Files and Setup the Environment +# -------------------------------------------------------------------- + +COPY ./configs/* /tmp/ + +RUN cp /tmp/90-cbdb-limits /etc/security/limits.d/90-cbdb-limits && \ + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && \ + echo $TZ > /etc/timezone && \ + chmod 755 /tmp/init_system.sh && \ + /usr/sbin/groupadd gpadmin && \ + /usr/sbin/useradd -m -g gpadmin 
gpadmin -s /bin/bash && \ + echo 'gpadmin ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/90-gpadmin && \ + chmod 0440 /etc/sudoers.d/90-gpadmin && \ + ssh-keygen -A && \ + mkdir /var/run/sshd && chmod 0755 /var/run/sshd + +# Install testinfra via pip +RUN pip3 install pytest-testinfra + +# Example: Copying test files into the container +COPY tests /tests + +USER gpadmin +WORKDIR /home/gpadmin + +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/deploy/docker/build/ubuntu22.04/configs/90-cbdb-limits b/devops/deploy/docker/build/ubuntu22.04/configs/90-cbdb-limits new file mode 100644 index 00000000000..8ea1d9ed988 --- /dev/null +++ b/devops/deploy/docker/build/ubuntu22.04/configs/90-cbdb-limits @@ -0,0 +1,32 @@ +# /etc/security/limits.d/90-db-limits + # -------------------------------------------------------------------- + # + # Licensed to the Apache Software Foundation (ASF) under one or more + # contributor license agreements. See the NOTICE file distributed + # with this work for additional information regarding copyright + # ownership. The ASF licenses this file to You under the Apache + # License, Version 2.0 (the "License"); you may not use this file + # except in compliance with the License. You may obtain a copy of the + # License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + # implied. See the License for the specific language governing + # permissions and limitations under the License. + # + # -------------------------------------------------------------------- + + # Core dump file size limits for gpadmin + gpadmin soft core unlimited + gpadmin hard core unlimited + + # Open file limits for gpadmin + gpadmin soft nofile 524288 + gpadmin hard nofile 524288 + + # Process limits for gpadmin + gpadmin soft nproc 131072 + gpadmin hard nproc 131072 diff --git a/devops/deploy/docker/build/ubuntu22.04/configs/gpinitsystem.conf b/devops/deploy/docker/build/ubuntu22.04/configs/gpinitsystem.conf new file mode 100644 index 00000000000..d4d312231c5 --- /dev/null +++ b/devops/deploy/docker/build/ubuntu22.04/configs/gpinitsystem.conf @@ -0,0 +1,89 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# gpinitsystem Configuration File for Apache Cloudberry +# -------------------------------------------------------------------- +# This configuration file is used to initialize an Apache Cloudberry +# cluster. 
It defines the settings for the coordinator, primary segments, +# and mirrors, as well as other important configuration options. +# -------------------------------------------------------------------- + +# Segment prefix - This prefix is used for naming the segment directories. +# For example, the primary segment directories will be named gpseg0, gpseg1, etc. +SEG_PREFIX=gpseg + +# Coordinator port - The port number where the coordinator will listen. +# This is the port used by clients to connect to the database. +COORDINATOR_PORT=5432 + +# Coordinator hostname - The hostname of the machine where the coordinator +# will be running. The $(hostname) command will automatically insert the +# hostname of the current machine. +COORDINATOR_HOSTNAME=$(hostname) + +# Coordinator data directory - The directory where the coordinator's data +# will be stored. This directory should have enough space to store metadata +# and system catalogs. +COORDINATOR_DIRECTORY=/data1/coordinator + +# Base port for primary segments - The starting port number for the primary +# segments. Each primary segment will use a unique port number starting from +# this base. +PORT_BASE=6000 + +# Primary segment data directories - An array specifying the directories where +# the primary segment data will be stored. Each directory corresponds to a +# primary segment. In this case, two primary segments will be created in the +# same directory. +declare -a DATA_DIRECTORY=(/data1/primary /data1/primary) + +# Base port for mirror segments - The starting port number for the mirror +# segments. Each mirror segment will use a unique port number starting from +# this base. +MIRROR_PORT_BASE=7000 + +# Mirror segment data directories - An array specifying the directories where +# the mirror segment data will be stored. Each directory corresponds to a +# mirror segment. In this case, two mirror segments will be created in the +# same directory. +declare -a MIRROR_DATA_DIRECTORY=(/data1/mirror /data1/mirror) + +# Trusted shell - The shell program used for remote execution. Cloudberry uses +# SSH to run commands on other machines in the cluster. 'ssh' is the default. +TRUSTED_SHELL=ssh + +# Database encoding - The character set encoding to be used by the database. +# 'UNICODE' is a common choice, especially for internationalization. +ENCODING=UNICODE + +# Default database name - The name of the default database to be created during +# initialization. This is also the default database that the gpadmin user will +# connect to. +DATABASE_NAME=gpadmin + +# Machine list file - A file containing the list of hostnames where the primary +# segments will be created. Each line in the file represents a different machine. +# This file is critical for setting up the cluster across multiple nodes. +MACHINE_LIST_FILE=/home/gpadmin/hostfile_gpinitsystem + +# -------------------------------------------------------------------- +# End of gpinitsystem Configuration File +# -------------------------------------------------------------------- diff --git a/devops/deploy/docker/build/ubuntu22.04/configs/init_system.sh b/devops/deploy/docker/build/ubuntu22.04/configs/init_system.sh new file mode 100755 index 00000000000..52a44462c61 --- /dev/null +++ b/devops/deploy/docker/build/ubuntu22.04/configs/init_system.sh @@ -0,0 +1,195 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +## Container Initialization Script +# -------------------------------------------------------------------- +## This script sets up the environment inside the Docker container for +## the Apache Cloudberry Build Environment. It performs the following +## tasks: +## +## 1. Verifies that the container is running with the expected hostname. +## 2. Starts the SSH daemon to allow SSH access to the container. +## 3. Configures passwordless SSH access for the 'gpadmin' user. +## 4. Displays a welcome banner and system information. +## 5. Starts an interactive bash shell. +## +## This script is intended to be used as an entrypoint or initialization +## script for the Docker container. +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Check if the hostname is 'cdw' +# -------------------------------------------------------------------- +# The script checks if the container's hostname is set to 'cdw'. This is +# a requirement for this environment, and if the hostname does not match, +# the script will exit with an error message. This ensures consistency +# across different environments. +# -------------------------------------------------------------------- +if [ "$(hostname)" != "cdw" ]; then + echo "Error: This container must be run with the hostname 'cdw'." + echo "Use the following command: docker run -h cdw ..." + exit 1 +fi + +# -------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# -------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. If the SSH +# daemon fails to start, the script exits with an error. +# -------------------------------------------------------------------- +if ! sudo /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# -------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# -------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# -------------------------------------------------------------------- +sudo rm -rf /run/nologin + +# -------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# -------------------------------------------------------------------- +# The script sets up SSH key-based authentication for the 'gpadmin' user, +# allowing passwordless SSH access. 
It generates a new SSH key pair if one +# does not already exist, and configures the necessary permissions. +# -------------------------------------------------------------------- +mkdir -p /home/gpadmin/.ssh +chmod 700 /home/gpadmin/.ssh + +if [ ! -f /home/gpadmin/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -C gpadmin -f /home/gpadmin/.ssh/id_rsa -P "" > /dev/null 2>&1 +fi + +cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys +chmod 600 /home/gpadmin/.ssh/authorized_keys + +# Add the container's hostname to the known_hosts file to avoid SSH warnings +ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null + +# Change to the home directory of the current user +cd $HOME + +# -------------------------------------------------------------------- +# Display a Welcome Banner +# -------------------------------------------------------------------- +# The following ASCII art and welcome message are displayed when the +# container starts. This banner provides a visual indication that the +# container is running in the Apache Cloudberry Build Environment. +# -------------------------------------------------------------------- +cat <<-'EOF' + +====================================================================== + + ++++++++++ ++++++ + ++++++++++++++ +++++++ + ++++ +++++ ++++ + ++++ +++++++++ + =+==== =============+ + ======== =====+ ===== + ==== ==== ==== ==== + ==== === === ==== + ==== === === ==== + ==== === ==-- === + ===== ===== -- ==== + ===================== ====== + ============================ + =-----= + ____ _ _ _ + / ___|| | ___ _ _ __| || |__ ___ _ __ _ __ _ _ + | | | | / _ \ | | | | / _` || '_ \ / _ \| '__|| '__|| | | | + | |___ | || (_) || |_| || (_| || |_) || __/| | | | | |_| | + \____||_| \____ \__,_| \__,_||_.__/ \___||_| |_| \__, | + |___/ +---------------------------------------------------------------------- + +EOF + +# -------------------------------------------------------------------- +# Display System Information +# -------------------------------------------------------------------- +# The script sources the /etc/os-release file to retrieve the operating +# system name and version. It then displays the following information: +# - OS name and version +# - Current user +# - Container hostname +# - IP address +# - CPU model name and number of cores +# - Total memory available +# This information is useful for users to understand the environment they +# are working in. +# -------------------------------------------------------------------- +source /etc/os-release + +# First, create the CPU info detection function +get_cpu_info() { + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + lscpu | grep 'Model name:' | awk '{print substr($0, index($0,$3))}' + elif [ "$ARCH" = "aarch64" ]; then + VENDOR=$(lscpu | grep 'Vendor ID:' | awk '{print $3}') + if [ "$VENDOR" = "Apple" ] || [ "$VENDOR" = "0x61" ]; then + echo "Apple Silicon ($ARCH)" + else + if [ -f /proc/cpuinfo ]; then + IMPL=$(grep "CPU implementer" /proc/cpuinfo | head -1 | awk '{print $3}') + PART=$(grep "CPU part" /proc/cpuinfo | head -1 | awk '{print $3}') + if [ ! -z "$IMPL" ] && [ ! -z "$PART" ]; then + echo "ARM $ARCH (Implementer: $IMPL, Part: $PART)" + else + echo "ARM $ARCH" + fi + else + echo "ARM $ARCH" + fi + fi + else + echo "Unknown architecture: $ARCH" + fi +} + +cat <<-EOF +Welcome to the Apache Cloudberry Build Environment! + +Container OS ........ : $NAME $VERSION +User ................ : $(whoami) +Container hostname .. : $(hostname) +IP Address .......... 
: $(hostname -I | awk '{print $1}') +CPU Info ............ : $(get_cpu_info) +CPU(s) .............. : $(nproc) +Memory .............. : $(free -h | grep Mem: | awk '{print $2}') total +====================================================================== + +EOF + +# Add go to PATH +source /etc/profile.d/go.sh + +# -------------------------------------------------------------------- +# Start an interactive bash shell +# -------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. +# -------------------------------------------------------------------- +/bin/bash diff --git a/devops/deploy/docker/build/ubuntu22.04/tests/requirements.txt b/devops/deploy/docker/build/ubuntu22.04/tests/requirements.txt new file mode 100644 index 00000000000..b9711eddac5 --- /dev/null +++ b/devops/deploy/docker/build/ubuntu22.04/tests/requirements.txt @@ -0,0 +1,3 @@ +testinfra +pytest-testinfra +paramiko diff --git a/devops/deploy/docker/build/ubuntu22.04/tests/testinfra/test_cloudberry_db_env.py b/devops/deploy/docker/build/ubuntu22.04/tests/testinfra/test_cloudberry_db_env.py new file mode 100644 index 00000000000..d7f018ab9df --- /dev/null +++ b/devops/deploy/docker/build/ubuntu22.04/tests/testinfra/test_cloudberry_db_env.py @@ -0,0 +1,128 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +import testinfra + +def test_installed_packages(host): + """ + Test if the essential packages are installed. + """ + packages = [ + "bat", + "bison", + "cmake", + "flex", + "g++-11", + "gcc-11", + "git", + "htop", + "iproute2", + "iputils-ping", + "libapr1-dev", + "libbz2-dev", + "libcurl4-gnutls-dev", + "libevent-dev", + "libipc-run-perl", + "libkrb5-dev", + "libldap-dev", + "liblz4-dev", + "libpam0g-dev", + "libperl-dev", + "libprotobuf-dev", + "libreadline-dev", + "libssl-dev", + "libuv1-dev", + "libxerces-c-dev", + "libxml2-dev", + "libyaml-dev", + "libzstd-dev", + "locales", + "lsof", + "make", + "openssh-server", + "pkg-config", + "protobuf-compiler", + "python3-distutils", + "python3-pip", + "python3-setuptools", + "python3.10", + "python3.10-dev", + "rsync", + "silversearcher-ag", + "sudo", + "tzdata", + "vim", + "wget", + "zlib1g-dev" + ] + for package in packages: + pkg = host.package(package) + assert pkg.is_installed + + +def test_user_gpadmin_exists(host): + """ + Test if the gpadmin user exists and is configured properly. 
+ """ + user = host.user("gpadmin") + assert user.exists + assert "gpadmin" in user.groups + + +def test_ssh_service(host): + """ + Test if SSH service is configured correctly. + """ + sshd_config = host.file("/etc/ssh/sshd_config") + assert sshd_config.exists + + +def test_timezone(host): + """ + Test if the timezone is configured correctly. + """ + localtime = host.file("/etc/localtime") + assert localtime.exists + + +def test_system_limits_configured(host): + """ + Test if the custom system limits are applied. + """ + limits_file = host.file("/etc/security/limits.d/90-cbdb-limits") + assert limits_file.exists + + +def test_init_system_script(host): + """ + Test if the init_system.sh script is present and executable. + """ + script = host.file("/tmp/init_system.sh") + assert script.exists + assert script.mode == 0o755 + + +def test_locale_generated(host): + """ + Test if the en_US.UTF-8 locale is correctly generated. + """ + locale = host.run("locale -a | grep en_US.utf8") + assert locale.exit_status == 0 + assert "en_US.utf8" in locale.stdout diff --git a/devops/deploy/docker/test/rocky8/Dockerfile b/devops/deploy/docker/test/rocky8/Dockerfile new file mode 100644 index 00000000000..5f6858d41ae --- /dev/null +++ b/devops/deploy/docker/test/rocky8/Dockerfile @@ -0,0 +1,135 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Apache Cloudberry (Incubating) is an effort undergoing incubation at +# the Apache Software Foundation (ASF), sponsored by the Apache +# Incubator PMC. +# +# Incubation is required of all newly accepted projects until a +# further review indicates that the infrastructure, communications, +# and decision making process have stabilized in a manner consistent +# with other successful ASF projects. +# +# While incubation status is not necessarily a reflection of the +# completeness or stability of the code, it does indicate that the +# project has yet to be fully endorsed by the ASF. +# +# -------------------------------------------------------------------- +# Dockerfile for Apache Cloudberry Base Environment +# -------------------------------------------------------------------- +# This Dockerfile sets up a Rocky Linux 8-based container to serve as +# a base environment for evaluating Apache Cloudberry. It installs +# necessary system utilities, configures the environment for SSH access, +# and sets up a 'gpadmin' user with sudo privileges. The Apache +# Cloudberry RPM can be installed into this container for testing and +# functional verification. 
+# +# Key Features: +# - Locale setup for en_US.UTF-8 +# - SSH daemon setup for remote access +# - Essential system utilities installation +# - Separate user creation and configuration steps +# +# Security Considerations: +# - This Dockerfile prioritizes ease of use for functional testing and +# evaluation. It includes configurations such as passwordless sudo access +# for the 'gpadmin' user and SSH access with password authentication. +# - These configurations are suitable for testing and development but +# should NOT be used in a production environment due to potential security +# risks. +# +# Usage: +# docker build -t cloudberry-db-base-env . +# docker run -h cdw -it cloudberry-db-base-env +# -------------------------------------------------------------------- + +# Base image: Rocky Linux 8 +FROM rockylinux/rockylinux:8 + +# Argument for configuring the timezone +ARG TIMEZONE_VAR="America/Los_Angeles" + +# Environment variables for locale +ENV LANG=en_US.UTF-8 + +# -------------------------------------------------------------------- +# System Update and Installation +# -------------------------------------------------------------------- +# Update the system and install essential system utilities required for +# running and testing Apache Cloudberry. Cleanup the DNF cache afterward +# to reduce the image size. +# -------------------------------------------------------------------- +RUN dnf install -y \ + file \ + gdb \ + glibc-locale-source \ + make \ + openssh \ + openssh-clients \ + openssh-server \ + procps-ng \ + sudo \ + which \ + && \ + dnf clean all # Clean up DNF cache after package installations + +# -------------------------------------------------------------------- +# User Creation and Configuration +# -------------------------------------------------------------------- +# - Create the 'gpadmin' user and group. +# - Configure the 'gpadmin' user with passwordless sudo privileges. +# - Add Cloudberry-specific entries to the gpadmin's .bashrc. +# -------------------------------------------------------------------- +RUN /usr/sbin/groupadd gpadmin && \ + /usr/sbin/useradd gpadmin -g gpadmin -G wheel && \ + echo 'gpadmin ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/90-gpadmin && \ + echo -e '\n# Add Cloudberry entries\nif [ -f /usr/local/cloudberry/cloudberry-env.sh ]; then\n source /usr/local/cloudberry/cloudberry-env.sh\n export COORDINATOR_DATA_DIRECTORY=/data1/coordinator/gpseg-1\nfi' >> /home/gpadmin/.bashrc + +# -------------------------------------------------------------------- +# Copy Configuration Files and Setup the Environment +# -------------------------------------------------------------------- +# - Copy custom configuration files from the build context to /tmp/. +# - Apply custom system limits and timezone. +# - Set up SSH for password-based authentication. +# - Generate locale and set the default locale to en_US.UTF-8. 
+# -------------------------------------------------------------------- +COPY ./configs/* /tmp/ + +RUN cp /tmp/90-cbdb-limits /etc/security/limits.d/90-cbdb-limits && \ + sed -i.bak -r 's/^(session\s+required\s+pam_limits.so)/#\1/' /etc/pam.d/* && \ + cat /usr/share/zoneinfo/${TIMEZONE_VAR} > /etc/localtime && \ + chmod 777 /tmp/init_system.sh && \ + setcap cap_net_raw+ep /usr/bin/ping && \ + ssh-keygen -A && \ + echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + localedef -i en_US -f UTF-8 en_US.UTF-8 && \ + echo "LANG=en_US.UTF-8" | tee /etc/locale.conf + +# -------------------------------------------------------------------- +# Set the Default User and Command +# -------------------------------------------------------------------- +# The default user is set to 'gpadmin', and the container starts by +# running the init_system.sh script. This container serves as a base +# environment, and the Apache Cloudberry RPM can be installed for +# testing and functional verification. +# -------------------------------------------------------------------- +USER gpadmin + +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/deploy/docker/test/rocky8/configs/90-cbdb-limits b/devops/deploy/docker/test/rocky8/configs/90-cbdb-limits new file mode 100644 index 00000000000..474957c42f6 --- /dev/null +++ b/devops/deploy/docker/test/rocky8/configs/90-cbdb-limits @@ -0,0 +1,32 @@ +# /etc/security/limits.d/90-db-limits +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +# Core dump file size limits for gpadmin +gpadmin soft core unlimited +gpadmin hard core unlimited + +# Open file limits for gpadmin +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 + +# Process limits for gpadmin +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 diff --git a/devops/deploy/docker/test/rocky8/configs/gpinitsystem.conf b/devops/deploy/docker/test/rocky8/configs/gpinitsystem.conf new file mode 100644 index 00000000000..2676929523b --- /dev/null +++ b/devops/deploy/docker/test/rocky8/configs/gpinitsystem.conf @@ -0,0 +1,87 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# gpinitsystem Configuration File for Apache Cloudberry +# -------------------------------------------------------------------- +# This configuration file is used to initialize a Apache Cloudberry +# cluster. It defines the settings for the coordinator, primary segments, +# and mirrors, as well as other important configuration options. +# -------------------------------------------------------------------- + +# Segment prefix - This prefix is used for naming the segment directories. +# For example, the primary segment directories will be named gpseg0, gpseg1, etc. +SEG_PREFIX=gpseg + +# Coordinator port - The port number where the coordinator will listen. +# This is the port used by clients to connect to the database. +COORDINATOR_PORT=5432 + +# Coordinator hostname - The hostname of the machine where the coordinator +# will be running. The $(hostname) command will automatically insert the +# hostname of the current machine. +COORDINATOR_HOSTNAME=$(hostname) + +# Coordinator data directory - The directory where the coordinator's data +# will be stored. This directory should have enough space to store metadata +# and system catalogs. +COORDINATOR_DIRECTORY=/data1/coordinator + +# Base port for primary segments - The starting port number for the primary +# segments. Each primary segment will use a unique port number starting from +# this base. +PORT_BASE=6000 + +# Primary segment data directories - An array specifying the directories where +# the primary segment data will be stored. Each directory corresponds to a +# primary segment. In this case, two primary segments will be created in the +# same directory. +declare -a DATA_DIRECTORY=(/data1/primary /data1/primary) + +# Base port for mirror segments - The starting port number for the mirror +# segments. Each mirror segment will use a unique port number starting from +# this base. +MIRROR_PORT_BASE=7000 + +# Mirror segment data directories - An array specifying the directories where +# the mirror segment data will be stored. Each directory corresponds to a +# mirror segment. In this case, two mirror segments will be created in the +# same directory. +declare -a MIRROR_DATA_DIRECTORY=(/data1/mirror /data1/mirror) + +# Trusted shell - The shell program used for remote execution. Cloudberry uses +# SSH to run commands on other machines in the cluster. 'ssh' is the default. +TRUSTED_SHELL=ssh + +# Database encoding - The character set encoding to be used by the database. +# 'UNICODE' is a common choice, especially for internationalization. +ENCODING=UNICODE + +# Default database name - The name of the default database to be created during +# initialization. This is also the default database that the gpadmin user will +# connect to. +DATABASE_NAME=gpadmin + +# Machine list file - A file containing the list of hostnames where the primary +# segments will be created. Each line in the file represents a different machine. +# This file is critical for setting up the cluster across multiple nodes. 
+MACHINE_LIST_FILE=/home/gpadmin/hostfile_gpinitsystem + +# -------------------------------------------------------------------- +# End of gpinitsystem Configuration File +# -------------------------------------------------------------------- diff --git a/devops/deploy/docker/test/rocky8/configs/init_system.sh b/devops/deploy/docker/test/rocky8/configs/init_system.sh new file mode 100755 index 00000000000..3ea7e34b0ff --- /dev/null +++ b/devops/deploy/docker/test/rocky8/configs/init_system.sh @@ -0,0 +1,221 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Container Initialization Script +# -------------------------------------------------------------------- +# This script sets up the environment inside the Docker container for +# the Apache Cloudberry Build Environment. It performs the following +# tasks: +# +# 1. Verifies that the container is running with the expected hostname. +# 2. Starts the SSH daemon to allow SSH access to the container. +# 3. Configures passwordless SSH access for the 'gpadmin' user. +# 4. Sets up the necessary directories and configuration files for +# Apache Cloudberry. +# 5. Displays a welcome banner and system information. +# 6. Starts an interactive bash shell. +# +# This script is intended to be used as an entrypoint or initialization +# script for the Docker container. +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Check if the hostname is 'cdw' +# -------------------------------------------------------------------- +# The script checks if the container's hostname is set to 'cdw'. This is +# a requirement for this environment, and if the hostname does not match, +# the script will exit with an error message. This ensures consistency +# across different environments. +# -------------------------------------------------------------------- +if [ "$(hostname)" != "cdw" ]; then + echo "Error: This container must be run with the hostname 'cdw'." + echo "Use the following command: docker run -h cdw ..." + exit 1 +fi + +# -------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# -------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. If the SSH +# daemon fails to start, the script exits with an error. +# -------------------------------------------------------------------- +if ! 
sudo /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# -------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# -------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# -------------------------------------------------------------------- +sudo rm -rf /run/nologin + +# -------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# -------------------------------------------------------------------- +# The script sets up SSH key-based authentication for the 'gpadmin' user, +# allowing passwordless SSH access. It generates a new SSH key pair if one +# does not already exist, and configures the necessary permissions. +# -------------------------------------------------------------------- +mkdir -p /home/gpadmin/.ssh +chmod 700 /home/gpadmin/.ssh + +if [ ! -f /home/gpadmin/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -C gpadmin -f /home/gpadmin/.ssh/id_rsa -P "" > /dev/null 2>&1 +fi + +cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys +chmod 600 /home/gpadmin/.ssh/authorized_keys + +# Add the container's hostname to the known_hosts file to avoid SSH warnings +ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null + +# -------------------------------------------------------------------- +# Cloudberry Data Directories Setup +# -------------------------------------------------------------------- +# The script sets up the necessary directories for Apache Cloudberry, +# including directories for the coordinator, standby coordinator, primary +# segments, and mirror segments. It also sets up the configuration files +# required for initializing the database. +# -------------------------------------------------------------------- +sudo rm -rf /data1/* +sudo mkdir -p /data1/coordinator /data1/standby_coordinator /data1/primary /data1/mirror +sudo chown -R gpadmin.gpadmin /data1 + +# Copy the gpinitsystem configuration file to the home directory +cp /tmp/gpinitsystem.conf /home/gpadmin + +# Set up the hostfile for cluster initialization +echo $(hostname) > /home/gpadmin/hostfile_gpinitsystem + +# Change to the home directory of the current user +cd $HOME + +# -------------------------------------------------------------------- +# Display a Welcome Banner +# -------------------------------------------------------------------- +# The following ASCII art and welcome message are displayed when the +# container starts. This banner provides a visual indication that the +# container is running in the Apache Cloudberry Build Environment. 
+# -------------------------------------------------------------------- +cat <<-'EOF' + +====================================================================== + + ++++++++++ ++++++ + ++++++++++++++ +++++++ + ++++ +++++ ++++ + ++++ +++++++++ + =+==== =============+ + ======== =====+ ===== + ==== ==== ==== ==== + ==== === === ==== + ==== === === ==== + ==== === ==-- === + ===== ===== -- ==== + ===================== ====== + ============================ + =-----= + ____ _ _ _ + / ___|| | ___ _ _ __| || |__ ___ _ __ _ __ _ _ + | | | | / _ \ | | | | / _` || '_ \ / _ \| '__|| '__|| | | | + | |___ | || (_) || |_| || (_| || |_) || __/| | | | | |_| | + \____||_| \____ \__,_| \__,_||_.__/ \___||_| |_| \__, | + |___/ +---------------------------------------------------------------------- + +EOF + +# -------------------------------------------------------------------- +# Display System Information +# -------------------------------------------------------------------- +# The script sources the /etc/os-release file to retrieve the operating +# system name and version. It then displays the following information: +# - OS name and version +# - Current user +# - Container hostname +# - IP address +# - CPU model name and number of cores +# - Total memory available +# - Cloudberry version (if installed) +# This information is useful for users to understand the environment they +# are working in. +# -------------------------------------------------------------------- +source /etc/os-release + +# First, create the CPU info detection function +get_cpu_info() { + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + lscpu | grep 'Model name:' | awk '{print substr($0, index($0,$3))}' + elif [ "$ARCH" = "aarch64" ]; then + VENDOR=$(lscpu | grep 'Vendor ID:' | awk '{print $3}') + if [ "$VENDOR" = "Apple" ] || [ "$VENDOR" = "0x61" ]; then + echo "Apple Silicon ($ARCH)" + else + if [ -f /proc/cpuinfo ]; then + IMPL=$(grep "CPU implementer" /proc/cpuinfo | head -1 | awk '{print $3}') + PART=$(grep "CPU part" /proc/cpuinfo | head -1 | awk '{print $3}') + if [ ! -z "$IMPL" ] && [ ! -z "$PART" ]; then + echo "ARM $ARCH (Implementer: $IMPL, Part: $PART)" + else + echo "ARM $ARCH" + fi + else + echo "ARM $ARCH" + fi + fi + else + echo "Unknown architecture: $ARCH" + fi +} + +# Check if Apache Cloudberry is installed and display its version +if rpm -q apache-cloudberry-db-incubating > /dev/null 2>&1; then + CBDB_VERSION=$(/usr/local/cbdb/bin/postgres --gp-version) +else + CBDB_VERSION="Not installed" +fi + +cat <<-EOF +Welcome to the Apache Cloudberry Test Environment! + +Cloudberry version .. : $CBDB_VERSION +Container OS ........ : $NAME $VERSION +User ................ : $(whoami) +Container hostname .. : $(hostname) +IP Address .......... : $(hostname -I | awk '{print $1}') +CPU Info ............ : $(get_cpu_info) +CPU(s) .............. : $(nproc) +Memory .............. : $(free -h | grep Mem: | awk '{print $2}') total +====================================================================== + +EOF + +# -------------------------------------------------------------------- +# Start an interactive bash shell +# -------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. 
+# -------------------------------------------------------------------- +/bin/bash diff --git a/devops/deploy/docker/test/rocky9/Dockerfile b/devops/deploy/docker/test/rocky9/Dockerfile new file mode 100644 index 00000000000..f5e1e3fbee3 --- /dev/null +++ b/devops/deploy/docker/test/rocky9/Dockerfile @@ -0,0 +1,135 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Apache Cloudberry (Incubating) is an effort undergoing incubation at +# the Apache Software Foundation (ASF), sponsored by the Apache +# Incubator PMC. +# +# Incubation is required of all newly accepted projects until a +# further review indicates that the infrastructure, communications, +# and decision making process have stabilized in a manner consistent +# with other successful ASF projects. +# +# While incubation status is not necessarily a reflection of the +# completeness or stability of the code, it does indicate that the +# project has yet to be fully endorsed by the ASF. +# +# -------------------------------------------------------------------- +# Dockerfile for Apache Cloudberry Base Environment +# -------------------------------------------------------------------- +# This Dockerfile sets up a Rocky Linux 9-based container to serve as +# a base environment for evaluating the Apache Cloudberry. It installs +# necessary system utilities, configures the environment for SSH access, +# and sets up a 'gpadmin' user with sudo privileges. The Cloudberry +# Database RPM can be installed into this container for testing and +# functional verification. +# +# Key Features: +# - Locale setup for en_US.UTF-8 +# - SSH daemon setup for remote access +# - Essential system utilities installation +# - Separate user creation and configuration steps +# +# Security Considerations: +# - This Dockerfile prioritizes ease of use for functional testing and +# evaluation. It includes configurations such as passwordless sudo access +# for the 'gpadmin' user and SSH access with password authentication. +# - These configurations are suitable for testing and development but +# should NOT be used in a production environment due to potential security +# risks. +# +# Usage: +# docker build -t cloudberry-db-base-env . 
+# docker run -h cdw -it cloudberry-db-base-env +# -------------------------------------------------------------------- + +# Base image: Rocky Linux 9 +FROM rockylinux/rockylinux:9 + +# Argument for configuring the timezone +ARG TIMEZONE_VAR="America/Los_Angeles" + +# Environment variables for locale +ENV LANG=en_US.UTF-8 + +# -------------------------------------------------------------------- +# System Update and Installation +# -------------------------------------------------------------------- +# Update the system and install essential system utilities required for +# running and testing Apache Cloudberry. Cleanup the DNF cache afterward +# to reduce the image size. +# -------------------------------------------------------------------- +RUN dnf install -y \ + file \ + gdb \ + glibc-locale-source \ + make \ + openssh \ + openssh-clients \ + openssh-server \ + procps-ng \ + sudo \ + which \ + && \ + dnf clean all # Clean up DNF cache after package installations + +# -------------------------------------------------------------------- +# User Creation and Configuration +# -------------------------------------------------------------------- +# - Create the 'gpadmin' user and group. +# - Configure the 'gpadmin' user with passwordless sudo privileges. +# - Add Cloudberry-specific entries to the gpadmin's .bashrc. +# -------------------------------------------------------------------- +RUN /usr/sbin/groupadd gpadmin && \ + /usr/sbin/useradd gpadmin -g gpadmin -G wheel && \ + echo 'gpadmin ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/90-gpadmin && \ + echo -e '\n# Add Cloudberry entries\nif [ -f /usr/local/cloudberry/cloudberry-env.sh ]; then\n source /usr/local/cloudberry/cloudberry-env.sh\n export COORDINATOR_DATA_DIRECTORY=/data1/coordinator/gpseg-1\nfi' >> /home/gpadmin/.bashrc + +# -------------------------------------------------------------------- +# Copy Configuration Files and Setup the Environment +# -------------------------------------------------------------------- +# - Copy custom configuration files from the build context to /tmp/. +# - Apply custom system limits and timezone. +# - Set up SSH for password-based authentication. +# - Generate locale and set the default locale to en_US.UTF-8. +# -------------------------------------------------------------------- +COPY ./configs/* /tmp/ + +RUN cp /tmp/90-cbdb-limits /etc/security/limits.d/90-cbdb-limits && \ + sed -i.bak -r 's/^(session\s+required\s+pam_limits.so)/#\1/' /etc/pam.d/* && \ + cat /usr/share/zoneinfo/${TIMEZONE_VAR} > /etc/localtime && \ + chmod 777 /tmp/init_system.sh && \ + setcap cap_net_raw+ep /usr/bin/ping && \ + ssh-keygen -A && \ + echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + localedef -i en_US -f UTF-8 en_US.UTF-8 && \ + echo "LANG=en_US.UTF-8" | tee /etc/locale.conf + +# -------------------------------------------------------------------- +# Set the Default User and Command +# -------------------------------------------------------------------- +# The default user is set to 'gpadmin', and the container starts by +# running the init_system.sh script. This container serves as a base +# environment, and the Apache Cloudberry RPM can be installed for +# testing and functional verification. 
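+#
+# Once the RPM is installed, the gpadmin .bashrc entries added above source
+# cloudberry-env.sh and export COORDINATOR_DATA_DIRECTORY, so an interactive
+# shell opened against a running container (name shown here is a
+# placeholder) should pick up the Cloudberry environment, e.g.:
+#
+#   docker exec -it <container> bash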
+# -------------------------------------------------------------------- +USER gpadmin + +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/deploy/docker/test/rocky9/configs/90-cbdb-limits b/devops/deploy/docker/test/rocky9/configs/90-cbdb-limits new file mode 100644 index 00000000000..474957c42f6 --- /dev/null +++ b/devops/deploy/docker/test/rocky9/configs/90-cbdb-limits @@ -0,0 +1,32 @@ +# /etc/security/limits.d/90-db-limits +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +# Core dump file size limits for gpadmin +gpadmin soft core unlimited +gpadmin hard core unlimited + +# Open file limits for gpadmin +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 + +# Process limits for gpadmin +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 diff --git a/devops/deploy/docker/test/rocky9/configs/gpinitsystem.conf b/devops/deploy/docker/test/rocky9/configs/gpinitsystem.conf new file mode 100644 index 00000000000..bb8f38d4f2d --- /dev/null +++ b/devops/deploy/docker/test/rocky9/configs/gpinitsystem.conf @@ -0,0 +1,87 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# ---------------------------------------------------------------------- +# gpinitsystem Configuration File for Apache Cloudberry +# ---------------------------------------------------------------------- +# This configuration file is used to initialize a Apache Cloudberry +# cluster. It defines the settings for the coordinator, primary segments, +# and mirrors, as well as other important configuration options. +# ---------------------------------------------------------------------- + +# Segment prefix - This prefix is used for naming the segment directories. +# For example, the primary segment directories will be named gpseg0, gpseg1, etc. +SEG_PREFIX=gpseg + +# Coordinator port - The port number where the coordinator will listen. 
+# This is the port used by clients to connect to the database. +COORDINATOR_PORT=5432 + +# Coordinator hostname - The hostname of the machine where the coordinator +# will be running. The $(hostname) command will automatically insert the +# hostname of the current machine. +COORDINATOR_HOSTNAME=$(hostname) + +# Coordinator data directory - The directory where the coordinator's data +# will be stored. This directory should have enough space to store metadata +# and system catalogs. +COORDINATOR_DIRECTORY=/data1/coordinator + +# Base port for primary segments - The starting port number for the primary +# segments. Each primary segment will use a unique port number starting from +# this base. +PORT_BASE=6000 + +# Primary segment data directories - An array specifying the directories where +# the primary segment data will be stored. Each directory corresponds to a +# primary segment. In this case, two primary segments will be created in the +# same directory. +declare -a DATA_DIRECTORY=(/data1/primary /data1/primary) + +# Base port for mirror segments - The starting port number for the mirror +# segments. Each mirror segment will use a unique port number starting from +# this base. +MIRROR_PORT_BASE=7000 + +# Mirror segment data directories - An array specifying the directories where +# the mirror segment data will be stored. Each directory corresponds to a +# mirror segment. In this case, two mirror segments will be created in the +# same directory. +declare -a MIRROR_DATA_DIRECTORY=(/data1/mirror /data1/mirror) + +# Trusted shell - The shell program used for remote execution. Cloudberry uses +# SSH to run commands on other machines in the cluster. 'ssh' is the default. +TRUSTED_SHELL=ssh + +# Database encoding - The character set encoding to be used by the database. +# 'UNICODE' is a common choice, especially for internationalization. +ENCODING=UNICODE + +# Default database name - The name of the default database to be created during +# initialization. This is also the default database that the gpadmin user will +# connect to. +DATABASE_NAME=gpadmin + +# Machine list file - A file containing the list of hostnames where the primary +# segments will be created. Each line in the file represents a different machine. +# This file is critical for setting up the cluster across multiple nodes. +MACHINE_LIST_FILE=/home/gpadmin/hostfile_gpinitsystem + +# ---------------------------------------------------------------------- +# End of gpinitsystem Configuration File +# ---------------------------------------------------------------------- diff --git a/devops/deploy/docker/test/rocky9/configs/init_system.sh b/devops/deploy/docker/test/rocky9/configs/init_system.sh new file mode 100755 index 00000000000..3ea7e34b0ff --- /dev/null +++ b/devops/deploy/docker/test/rocky9/configs/init_system.sh @@ -0,0 +1,221 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Container Initialization Script +# -------------------------------------------------------------------- +# This script sets up the environment inside the Docker container for +# the Apache Cloudberry Build Environment. It performs the following +# tasks: +# +# 1. Verifies that the container is running with the expected hostname. +# 2. Starts the SSH daemon to allow SSH access to the container. +# 3. Configures passwordless SSH access for the 'gpadmin' user. +# 4. Sets up the necessary directories and configuration files for +# Apache Cloudberry. +# 5. Displays a welcome banner and system information. +# 6. Starts an interactive bash shell. +# +# This script is intended to be used as an entrypoint or initialization +# script for the Docker container. +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Check if the hostname is 'cdw' +# -------------------------------------------------------------------- +# The script checks if the container's hostname is set to 'cdw'. This is +# a requirement for this environment, and if the hostname does not match, +# the script will exit with an error message. This ensures consistency +# across different environments. +# -------------------------------------------------------------------- +if [ "$(hostname)" != "cdw" ]; then + echo "Error: This container must be run with the hostname 'cdw'." + echo "Use the following command: docker run -h cdw ..." + exit 1 +fi + +# -------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# -------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. If the SSH +# daemon fails to start, the script exits with an error. +# -------------------------------------------------------------------- +if ! sudo /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# -------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# -------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# -------------------------------------------------------------------- +sudo rm -rf /run/nologin + +# -------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# -------------------------------------------------------------------- +# The script sets up SSH key-based authentication for the 'gpadmin' user, +# allowing passwordless SSH access. It generates a new SSH key pair if one +# does not already exist, and configures the necessary permissions. 
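+# As an illustrative sanity check (not executed by this script), passwordless
+# access can be confirmed after setup with:
+#   ssh -o BatchMode=yes cdw true && echo "passwordless SSH OK"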
+# -------------------------------------------------------------------- +mkdir -p /home/gpadmin/.ssh +chmod 700 /home/gpadmin/.ssh + +if [ ! -f /home/gpadmin/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -C gpadmin -f /home/gpadmin/.ssh/id_rsa -P "" > /dev/null 2>&1 +fi + +cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys +chmod 600 /home/gpadmin/.ssh/authorized_keys + +# Add the container's hostname to the known_hosts file to avoid SSH warnings +ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null + +# -------------------------------------------------------------------- +# Cloudberry Data Directories Setup +# -------------------------------------------------------------------- +# The script sets up the necessary directories for Apache Cloudberry, +# including directories for the coordinator, standby coordinator, primary +# segments, and mirror segments. It also sets up the configuration files +# required for initializing the database. +# -------------------------------------------------------------------- +sudo rm -rf /data1/* +sudo mkdir -p /data1/coordinator /data1/standby_coordinator /data1/primary /data1/mirror +sudo chown -R gpadmin.gpadmin /data1 + +# Copy the gpinitsystem configuration file to the home directory +cp /tmp/gpinitsystem.conf /home/gpadmin + +# Set up the hostfile for cluster initialization +echo $(hostname) > /home/gpadmin/hostfile_gpinitsystem + +# Change to the home directory of the current user +cd $HOME + +# -------------------------------------------------------------------- +# Display a Welcome Banner +# -------------------------------------------------------------------- +# The following ASCII art and welcome message are displayed when the +# container starts. This banner provides a visual indication that the +# container is running in the Apache Cloudberry Build Environment. +# -------------------------------------------------------------------- +cat <<-'EOF' + +====================================================================== + + ++++++++++ ++++++ + ++++++++++++++ +++++++ + ++++ +++++ ++++ + ++++ +++++++++ + =+==== =============+ + ======== =====+ ===== + ==== ==== ==== ==== + ==== === === ==== + ==== === === ==== + ==== === ==-- === + ===== ===== -- ==== + ===================== ====== + ============================ + =-----= + ____ _ _ _ + / ___|| | ___ _ _ __| || |__ ___ _ __ _ __ _ _ + | | | | / _ \ | | | | / _` || '_ \ / _ \| '__|| '__|| | | | + | |___ | || (_) || |_| || (_| || |_) || __/| | | | | |_| | + \____||_| \____ \__,_| \__,_||_.__/ \___||_| |_| \__, | + |___/ +---------------------------------------------------------------------- + +EOF + +# -------------------------------------------------------------------- +# Display System Information +# -------------------------------------------------------------------- +# The script sources the /etc/os-release file to retrieve the operating +# system name and version. It then displays the following information: +# - OS name and version +# - Current user +# - Container hostname +# - IP address +# - CPU model name and number of cores +# - Total memory available +# - Cloudberry version (if installed) +# This information is useful for users to understand the environment they +# are working in. 
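+# (For reference, the values reported below come from standard tools such as
+# /etc/os-release, lscpu, nproc, and free -h, plus `postgres --gp-version`
+# when Cloudberry is installed; nothing in this section modifies the container.)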
+# -------------------------------------------------------------------- +source /etc/os-release + +# First, create the CPU info detection function +get_cpu_info() { + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + lscpu | grep 'Model name:' | awk '{print substr($0, index($0,$3))}' + elif [ "$ARCH" = "aarch64" ]; then + VENDOR=$(lscpu | grep 'Vendor ID:' | awk '{print $3}') + if [ "$VENDOR" = "Apple" ] || [ "$VENDOR" = "0x61" ]; then + echo "Apple Silicon ($ARCH)" + else + if [ -f /proc/cpuinfo ]; then + IMPL=$(grep "CPU implementer" /proc/cpuinfo | head -1 | awk '{print $3}') + PART=$(grep "CPU part" /proc/cpuinfo | head -1 | awk '{print $3}') + if [ ! -z "$IMPL" ] && [ ! -z "$PART" ]; then + echo "ARM $ARCH (Implementer: $IMPL, Part: $PART)" + else + echo "ARM $ARCH" + fi + else + echo "ARM $ARCH" + fi + fi + else + echo "Unknown architecture: $ARCH" + fi +} + +# Check if Apache Cloudberry is installed and display its version +if rpm -q apache-cloudberry-db-incubating > /dev/null 2>&1; then + CBDB_VERSION=$(/usr/local/cbdb/bin/postgres --gp-version) +else + CBDB_VERSION="Not installed" +fi + +cat <<-EOF +Welcome to the Apache Cloudberry Test Environment! + +Cloudberry version .. : $CBDB_VERSION +Container OS ........ : $NAME $VERSION +User ................ : $(whoami) +Container hostname .. : $(hostname) +IP Address .......... : $(hostname -I | awk '{print $1}') +CPU Info ............ : $(get_cpu_info) +CPU(s) .............. : $(nproc) +Memory .............. : $(free -h | grep Mem: | awk '{print $2}') total +====================================================================== + +EOF + +# -------------------------------------------------------------------- +# Start an interactive bash shell +# -------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. +# -------------------------------------------------------------------- +/bin/bash diff --git a/devops/deploy/docker/test/ubuntu22.04/Dockerfile b/devops/deploy/docker/test/ubuntu22.04/Dockerfile new file mode 100644 index 00000000000..70c0f26be90 --- /dev/null +++ b/devops/deploy/docker/test/ubuntu22.04/Dockerfile @@ -0,0 +1,140 @@ + +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# +# Apache Cloudberry (incubating) is an effort undergoing incubation at +# the Apache Software Foundation (ASF), sponsored by the Apache +# Incubator PMC. 
+# +# Incubation is required of all newly accepted projects until a +# further review indicates that the infrastructure, communications, +# and decision making process have stabilized in a manner consistent +# with other successful ASF projects. +# +# While incubation status is not necessarily a reflection of the +# completeness or stability of the code, it does indicate that the +# project has yet to be fully endorsed by the ASF. +# +# -------------------------------------------------------------------- +# Dockerfile for Apache Cloudberry Base Environment +# -------------------------------------------------------------------- +# This Dockerfile sets up a Ubuntu jammy 22.04 -based container to serve as +# a base environment for evaluating the Apache Cloudberry. It installs +# necessary system utilities, configures the environment for SSH access, +# and sets up a 'gpadmin' user with sudo privileges. The Apache Cloudberry +# DEB can be installed into this container for testing and +# functional verification. +# +# Key Features: +# - Locale setup for en_US.UTF-8 +# - SSH daemon setup for remote access +# - Essential system utilities installation +# - Separate user creation and configuration steps +# +# Security Considerations: +# - This Dockerfile prioritizes ease of use for functional testing and +# evaluation. It includes configurations such as passwordless sudo access +# for the 'gpadmin' user and SSH access with password authentication. +# - These configurations are suitable for testing and development but +# should NOT be used in a production environment due to potential security +# risks. +# +# Usage: +# docker build -t cloudberry-db-base-env . +# docker run -h cdw -it cloudberry-db-base-env +# -------------------------------------------------------------------- + +FROM ubuntu:22.04 + +# Argument for configuring the timezone +ARG TIMEZONE_VAR="Europe/London" + +# Environment variables for locale and user +ENV container=docker +ENV LANG=en_US.UTF-8 +ENV USER=gpadmin +ENV TZ=${TIMEZONE_VAR} +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NOWARNINGS="yes" + +# -------------------------------------------------------------------- +# Install Development Tools and Utilities +# -------------------------------------------------------------------- + +RUN apt-get update && \ + apt-get install -y -qq \ + htop \ + bat \ + silversearcher-ag \ + vim \ + wget \ + gdb \ + git \ + iputils-ping \ + lsof \ + openssh-server \ + pkg-config \ + python3.10 \ + python3-distutils \ + python3-pip \ + python3-setuptools \ + sudo \ + tzdata && \ + apt-get install -y -qq locales && \ + locale-gen ${LANG} && \ + update-locale LANG=${LANG} && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3.10 /usr/bin/python + +# -------------------------------------------------------------------- +# User Creation and Configuration +# -------------------------------------------------------------------- +# - Create the 'gpadmin' user and group. +# - Configure the 'gpadmin' user with passwordless sudo privileges. +# - Add Cloudberry-specific entries to the gpadmin's .bashrc. 
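+# - (Illustrative check, not part of the build: inside a running container,
+#   `sudo -l -U gpadmin` should report "(ALL) NOPASSWD: ALL".)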
+# -------------------------------------------------------------------- +RUN /usr/sbin/groupadd gpadmin && \ + /usr/sbin/useradd -m -g gpadmin gpadmin -s /bin/bash && \ + echo 'gpadmin ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/90-gpadmin && \ + chmod 0440 /etc/sudoers.d/90-gpadmin && \ + echo '\n# Add Cloudberry entries\nif [ -f /usr/local/cloudberry/cloudberry-env.sh ]; then\n source /usr/local/cloudberry/cloudberry-env.sh\n export COORDINATOR_DATA_DIRECTORY=/data1/coordinator/gpseg-1\nfi' >> /home/gpadmin/.bashrc + +# -------------------------------------------------------------------- +# Copy Configuration Files and Setup the Environment +# -------------------------------------------------------------------- +# - Copy custom configuration files from the build context to /tmp/. +# - Apply custom system limits and timezone. +# - Set up SSH for password-based authentication. +# - Generate locale and set the default locale to en_US.UTF-8. +# -------------------------------------------------------------------- +COPY ./configs/* /tmp/ + +RUN cp /tmp/90-cbdb-limits /etc/security/limits.d/90-cbdb-limits && \ + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && \ + echo $TZ > /etc/timezone && \ + chmod 755 /tmp/init_system.sh && \ + ssh-keygen -A + +USER gpadmin +WORKDIR /home/gpadmin + +CMD ["bash","-c","/tmp/init_system.sh"] + diff --git a/devops/deploy/docker/test/ubuntu22.04/configs/90-cbdb-limits b/devops/deploy/docker/test/ubuntu22.04/configs/90-cbdb-limits new file mode 100644 index 00000000000..474957c42f6 --- /dev/null +++ b/devops/deploy/docker/test/ubuntu22.04/configs/90-cbdb-limits @@ -0,0 +1,32 @@ +# /etc/security/limits.d/90-db-limits +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- + +# Core dump file size limits for gpadmin +gpadmin soft core unlimited +gpadmin hard core unlimited + +# Open file limits for gpadmin +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 + +# Process limits for gpadmin +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 diff --git a/devops/deploy/docker/test/ubuntu22.04/configs/gpinitsystem.conf b/devops/deploy/docker/test/ubuntu22.04/configs/gpinitsystem.conf new file mode 100644 index 00000000000..896c8c79e54 --- /dev/null +++ b/devops/deploy/docker/test/ubuntu22.04/configs/gpinitsystem.conf @@ -0,0 +1,87 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. 
The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# ---------------------------------------------------------------------- +# gpinitsystem Configuration File for Cloudberry Database +# ---------------------------------------------------------------------- +# This configuration file is used to initialize a Cloudberry Database +# cluster. It defines the settings for the coordinator, primary segments, +# and mirrors, as well as other important configuration options. +# ---------------------------------------------------------------------- + +# Segment prefix - This prefix is used for naming the segment directories. +# For example, the primary segment directories will be named gpseg0, gpseg1, etc. +SEG_PREFIX=gpseg + +# Coordinator port - The port number where the coordinator will listen. +# This is the port used by clients to connect to the database. +COORDINATOR_PORT=5432 + +# Coordinator hostname - The hostname of the machine where the coordinator +# will be running. The $(hostname) command will automatically insert the +# hostname of the current machine. +COORDINATOR_HOSTNAME=$(hostname) + +# Coordinator data directory - The directory where the coordinator's data +# will be stored. This directory should have enough space to store metadata +# and system catalogs. +COORDINATOR_DIRECTORY=/data1/coordinator + +# Base port for primary segments - The starting port number for the primary +# segments. Each primary segment will use a unique port number starting from +# this base. +PORT_BASE=6000 + +# Primary segment data directories - An array specifying the directories where +# the primary segment data will be stored. Each directory corresponds to a +# primary segment. In this case, two primary segments will be created in the +# same directory. +declare -a DATA_DIRECTORY=(/data1/primary /data1/primary) + +# Base port for mirror segments - The starting port number for the mirror +# segments. Each mirror segment will use a unique port number starting from +# this base. +MIRROR_PORT_BASE=7000 + +# Mirror segment data directories - An array specifying the directories where +# the mirror segment data will be stored. Each directory corresponds to a +# mirror segment. In this case, two mirror segments will be created in the +# same directory. +declare -a MIRROR_DATA_DIRECTORY=(/data1/mirror /data1/mirror) + +# Trusted shell - The shell program used for remote execution. Cloudberry uses +# SSH to run commands on other machines in the cluster. 'ssh' is the default. +TRUSTED_SHELL=ssh + +# Database encoding - The character set encoding to be used by the database. +# 'UNICODE' is a common choice, especially for internationalization. +ENCODING=UNICODE + +# Default database name - The name of the default database to be created during +# initialization. This is also the default database that the gpadmin user will +# connect to. +DATABASE_NAME=gpadmin + +# Machine list file - A file containing the list of hostnames where the primary +# segments will be created. 
Each line in the file represents a different machine. +# This file is critical for setting up the cluster across multiple nodes. +MACHINE_LIST_FILE=/home/gpadmin/hostfile_gpinitsystem + +# ---------------------------------------------------------------------- +# End of gpinitsystem Configuration File +# ---------------------------------------------------------------------- diff --git a/devops/deploy/docker/test/ubuntu22.04/configs/init_system.sh b/devops/deploy/docker/test/ubuntu22.04/configs/init_system.sh new file mode 100644 index 00000000000..1928fe511d9 --- /dev/null +++ b/devops/deploy/docker/test/ubuntu22.04/configs/init_system.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Container Initialization Script +# -------------------------------------------------------------------- +# This script sets up the environment inside the Docker container for +# the Apache Cloudberry Build Environment. It performs the following +# tasks: +# +# 1. Verifies that the container is running with the expected hostname. +# 2. Starts the SSH daemon to allow SSH access to the container. +# 3. Configures passwordless SSH access for the 'gpadmin' user. +# 4. Sets up the necessary directories and configuration files for +# Apache Cloudberry. +# 5. Displays a welcome banner and system information. +# 6. Starts an interactive bash shell. +# +# This script is intended to be used as an entrypoint or initialization +# script for the Docker container. +# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Check if the hostname is 'cdw' +# -------------------------------------------------------------------- +# The script checks if the container's hostname is set to 'cdw'. This is +# a requirement for this environment, and if the hostname does not match, +# the script will exit with an error message. This ensures consistency +# across different environments. +# -------------------------------------------------------------------- +if [ "$(hostname)" != "cdw" ]; then + echo "Error: This container must be run with the hostname 'cdw'." + echo "Use the following command: docker run -h cdw ..." + exit 1 +fi + +# -------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# -------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. If the SSH +# daemon fails to start, the script exits with an error. 
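+# (If startup fails, running `sudo /usr/sbin/sshd -t` by hand is a common way
+# to surface configuration problems; it is mentioned here for reference only.)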
+# -------------------------------------------------------------------- +sudo mkdir -p /run/sshd +sudo chmod 755 /run/sshd + +if ! sudo /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# -------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# -------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# -------------------------------------------------------------------- +sudo rm -rf /run/nologin + +# -------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# -------------------------------------------------------------------- +# The script sets up SSH key-based authentication for the 'gpadmin' user, +# allowing passwordless SSH access. It generates a new SSH key pair if one +# does not already exist, and configures the necessary permissions. +# -------------------------------------------------------------------- +mkdir -p /home/gpadmin/.ssh +chmod 700 /home/gpadmin/.ssh + +if [ ! -f /home/gpadmin/.ssh/id_rsa ]; then + ssh-keygen -t rsa -b 4096 -C gpadmin -f /home/gpadmin/.ssh/id_rsa -P "" > /dev/null 2>&1 +fi + +cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys +chmod 600 /home/gpadmin/.ssh/authorized_keys + +# Add the container's hostname to the known_hosts file to avoid SSH warnings +ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null + +# -------------------------------------------------------------------- +# Cloudberry Data Directories Setup +# -------------------------------------------------------------------- +# The script sets up the necessary directories for Apache Cloudberry, +# including directories for the coordinator, standby coordinator, primary +# segments, and mirror segments. It also sets up the configuration files +# required for initializing the database. +# -------------------------------------------------------------------- +sudo rm -rf /data1/* +sudo mkdir -p /data1/coordinator /data1/standby_coordinator /data1/primary /data1/mirror +sudo chown -R gpadmin.gpadmin /data1 + +# Copy the gpinitsystem configuration file to the home directory +cp /tmp/gpinitsystem.conf /home/gpadmin + +# Set up the hostfile for cluster initialization +echo $(hostname) > /home/gpadmin/hostfile_gpinitsystem + +# Change to the home directory of the current user +cd $HOME + +# -------------------------------------------------------------------- +# Display a Welcome Banner +# -------------------------------------------------------------------- +# The following ASCII art and welcome message are displayed when the +# container starts. This banner provides a visual indication that the +# container is running in the Apache Cloudberry Build Environment. 
+# -------------------------------------------------------------------- +cat <<-'EOF' + +====================================================================== + + ++++++++++ ++++++ + ++++++++++++++ +++++++ + ++++ +++++ ++++ + ++++ +++++++++ + =+==== =============+ + ======== =====+ ===== + ==== ==== ==== ==== + ==== === === ==== + ==== === === ==== + ==== === ==-- === + ===== ===== -- ==== + ===================== ====== + ============================ + =-----= + ____ _ _ _ + / ___|| | ___ _ _ __| || |__ ___ _ __ _ __ _ _ + | | | | / _ \ | | | | / _` || '_ \ / _ \| '__|| '__|| | | | + | |___ | || (_) || |_| || (_| || |_) || __/| | | | | |_| | + \____||_| \____ \__,_| \__,_||_.__/ \___||_| |_| \__, | + |___/ +---------------------------------------------------------------------- + +EOF + +# -------------------------------------------------------------------- +# Display System Information +# -------------------------------------------------------------------- +# The script sources the /etc/os-release file to retrieve the operating +# system name and version. It then displays the following information: +# - OS name and version +# - Current user +# - Container hostname +# - IP address +# - CPU model name and number of cores +# - Total memory available +# - Cloudberry version (if installed) +# This information is useful for users to understand the environment they +# are working in. +# -------------------------------------------------------------------- +source /etc/os-release + +# First, create the CPU info detection function +get_cpu_info() { + ARCH=$(uname -m) + if [ "$ARCH" = "x86_64" ]; then + lscpu | grep 'Model name:' | awk '{print substr($0, index($0,$3))}' + elif [ "$ARCH" = "aarch64" ]; then + VENDOR=$(lscpu | grep 'Vendor ID:' | awk '{print $3}') + if [ "$VENDOR" = "Apple" ] || [ "$VENDOR" = "0x61" ]; then + echo "Apple Silicon ($ARCH)" + else + if [ -f /proc/cpuinfo ]; then + IMPL=$(grep "CPU implementer" /proc/cpuinfo | head -1 | awk '{print $3}') + PART=$(grep "CPU part" /proc/cpuinfo | head -1 | awk '{print $3}') + if [ ! -z "$IMPL" ] && [ ! -z "$PART" ]; then + echo "ARM $ARCH (Implementer: $IMPL, Part: $PART)" + else + echo "ARM $ARCH" + fi + else + echo "ARM $ARCH" + fi + fi + else + echo "Unknown architecture: $ARCH" + fi +} + +# Check if Apache Cloudberry is installed and display its version +if dpkg -l apache-cloudberry-db-incubating > /dev/null 2>&1; then + CBDB_VERSION=$(/usr/local/cbdb/bin/postgres --gp-version) +else + CBDB_VERSION="Not installed" +fi + +cat <<-EOF +Welcome to the Apache Cloudberry Test Environment! + +Cloudberry version .. : $CBDB_VERSION +Container OS ........ : $NAME $VERSION +User ................ : $(whoami) +Container hostname .. : $(hostname) +IP Address .......... : $(hostname -I | awk '{print $1}') +CPU Info ............ : $(get_cpu_info) +CPU(s) .............. : $(nproc) +Memory .............. : $(free -h | grep Mem: | awk '{print $2}') total +====================================================================== + +EOF + +# -------------------------------------------------------------------- +# Start an interactive bash shell +# -------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. 
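+# (Note that the container stops once this shell exits, so the image is meant
+# to be run interactively, e.g. `docker run -h cdw -it <image>` as described
+# in the Dockerfile.)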
+# -------------------------------------------------------------------- +/bin/bash diff --git a/devops/release/cloudberry-release.sh b/devops/release/cloudberry-release.sh new file mode 100755 index 00000000000..5fd579b481e --- /dev/null +++ b/devops/release/cloudberry-release.sh @@ -0,0 +1,496 @@ +#!/usr/bin/env bash +# ====================================================================== +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ====================================================================== +# +# cloudberry-release.sh — Apache Cloudberry (Incubating) release utility +# +# This script automates the preparation of an Apache Cloudberry release +# candidate, including version validation, tag creation, and source +# tarball assembly. +# +# Supported Features: +# - Validates version consistency across configure.ac, configure, gpversion.py, and pom.xml +# - Supports both final releases and release candidates (e.g., 2.0.0-incubating, 2.0.0-incubating-rc1) +# - Optionally reuses existing annotated Git tags if they match the current HEAD +# - Verifies that Git submodules are initialized (if defined in .gitmodules) +# - Verifies Git identity (user.name and user.email) prior to tagging +# - Creates a BUILD_NUMBER file (currently hardcoded as 1) in the release tarball +# - Recursively archives all submodules into the source tarball +# - Generates SHA-512 checksum (.sha512) for the source tarball +# - Generates GPG signature (.asc) for the source tarball, unless --skip-signing is used +# - Moves signed artifacts into a dedicated artifacts/ directory +# - Verifies integrity and authenticity of artifacts via SHA-512 checksum and GPG signature +# - Allows skipping of upstream remote URL validation (e.g., for forks) via --skip-remote-check +# +# Usage: +# ./cloudberry-release.sh --stage --tag 2.0.0-incubating-rc1 --gpg-user your@apache.org +# +# Options: +# -s, --stage Stage a release candidate and generate source tarball +# -t, --tag Tag to apply or validate (e.g., 2.0.0-incubating-rc1) +# -f, --force-tag-reuse Allow reuse of an existing tag (must match HEAD) +# -r, --repo Optional path to local Cloudberry Git repository +# -S, --skip-remote-check Skip validation of remote.origin.url (useful for forks/mirrors) +# -g, --gpg-user GPG key ID or email to use for signing (required) +# -k, --skip-signing Skip GPG key validation and signature generation +# -h, --help Show usage and exit +# +# Requirements: +# - Must be run from the root of a valid Apache Cloudberry Git clone, +# or the path must be explicitly provided using --repo +# - Git user.name and user.email must be configured +# - Repository remote must be: git@github.com:apache/cloudberry.git +# +# Examples: +# ./cloudberry-release.sh -s -t 2.0.0-incubating-rc1 --gpg-user your@apache.org +# 
./cloudberry-release.sh -s -t 2.0.0-incubating-rc1 --skip-signing +# ./cloudberry-release.sh --stage --tag 2.0.0-incubating-rc2 --force-tag-reuse --gpg-user your@apache.org +# ./cloudberry-release.sh --stage --tag 2.0.0-incubating-rc1 -r ~/cloudberry --skip-remote-check --gpg-user your@apache.org +# +# Notes: +# - When reusing a tag, the `--force-tag-reuse` flag must be provided. +# - This script creates a BUILD_NUMBER file in the source root for traceability. It is included in the tarball. +# ====================================================================== + +set -euo pipefail + +confirm() { + read -r -p "$1 [y/N] " response + case "$response" in + [yY][eE][sS]|[yY]) true ;; + *) echo "Aborted."; exit 1 ;; + esac +} + +section() { + echo + echo "=================================================================" + echo ">> $1" + echo "=================================================================" +} + +show_help() { + echo "Apache Cloudberry (Incubating) Release Tool" + echo + echo "Usage:" + echo " $0 --stage --tag " + echo + echo "Options:" + echo " -s, --stage" + echo " Stage a release candidate and generate source tarball" + echo + echo " -t, --tag " + echo " Required with --stage (e.g., 2.0.0-incubating-rc1)" + echo + echo " -f, --force-tag-reuse" + echo " Reuse existing tag if it matches current HEAD" + echo + echo " -r, --repo " + echo " Optional path to a local Cloudberry Git repository clone" + echo + echo " -S, --skip-remote-check" + echo " Skip remote.origin.url check (use for forks or mirrors)" + echo " Required for official releases:" + echo " git@github.com:apache/cloudberry.git" + echo + echo " -g, --gpg-user " + echo " GPG key ID or email to use for signing (required unless --skip-signing)" + echo + echo " -k, --skip-signing" + echo " Skip GPG key validation and signature generation" + echo + echo " -h, --help" + echo " Show this help message" + exit 1 +} + +# Flags +STAGE=false +SKIP_SIGNING=false +TAG="" +FORCE_TAG_REUSE=false +REPO_ARG="" +SKIP_REMOTE_CHECK=false +GPG_USER="" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + -g|--gpg-user) + if [[ $# -lt 2 ]]; then + echo "ERROR: --gpg-user requires an email." >&2 + show_help + fi + GPG_USER="$2" + shift 2 + ;; + -s|--stage) + STAGE=true + shift + ;; + -t|--tag) + if [[ $# -lt 2 ]]; then + echo "ERROR: Missing tag value after --tag" >&2 + show_help + fi + TAG="$2" + shift 2 + ;; + -f|--force-tag-reuse) + FORCE_TAG_REUSE=true + shift + ;; + -r|--repo) + if [[ $# -lt 2 ]]; then + echo "ERROR: --repo requires a path." >&2 + show_help + fi + REPO_ARG="$2" + shift 2 + ;; + -S|--skip-remote-check) + SKIP_REMOTE_CHECK=true + shift + ;; + -k|--skip-signing) + SKIP_SIGNING=true + shift + ;; + -h|--help) + show_help + ;; + *) + echo "ERROR: Unknown option: $1" >&2 + show_help + ;; + esac +done + +# GPG signing checks +if [[ "$SKIP_SIGNING" != true ]]; then + if [[ -z "$GPG_USER" ]]; then + echo "ERROR: --gpg-user is required for signing the release tarball." >&2 + show_help + fi + + if ! gpg --list-keys "$GPG_USER" > /dev/null 2>&1; then + echo "ERROR: GPG key '$GPG_USER' not found in your local keyring." >&2 + echo "Please import or generate the key before proceeding." >&2 + exit 1 + fi +else + echo "INFO: GPG signing has been intentionally skipped (--skip-signing)." +fi + +if [[ -n "$REPO_ARG" ]]; then + if [[ -n "$REPO_ARG" ]]; then + if [[ ! -d "$REPO_ARG" || ! -f "$REPO_ARG/configure.ac" ]]; then + echo "ERROR: '$REPO_ARG' does not appear to be a valid Cloudberry source directory." 
+ echo "Expected to find a 'configure.ac' file but it is missing." + echo + echo "Hint: Make sure you passed the correct --repo path to a valid Git clone." + exit 1 + fi + cd "$REPO_ARG" + elif [[ ! -f configure.ac ]]; then + echo "ERROR: No Cloudberry source directory specified and no 'configure.ac' found in the current directory." + echo + echo "Hint: Either run this script from the root of a Cloudberry Git clone," + echo "or use the --repo option to specify the source directory." + exit 1 + fi + cd "$REPO_ARG" + + if [[ ! -d ".git" ]]; then + echo "ERROR: '$REPO_ARG' is not a valid Git repository." + exit 1 + fi + + if [[ "$SKIP_REMOTE_CHECK" != true ]]; then + REMOTE_URL=$(git config --get remote.origin.url || true) + if [[ "$REMOTE_URL" != "git@github.com:apache/cloudberry.git" ]]; then + echo "ERROR: remote.origin.url must be set to 'git@github.com:apache/cloudberry.git' for official releases." + echo " Found: '${REMOTE_URL:-}'" + echo + echo "This check ensures the release is being staged from the authoritative upstream repository." + echo "Use --skip-remote-check only if this is a fork or non-release automation." + exit 1 + fi + fi +fi + +# If --repo was not provided, ensure we are in a valid source directory +if [[ -z "$REPO_ARG" ]]; then + if [[ ! -f configure.ac || ! -f gpMgmt/bin/gppylib/gpversion.py || ! -f pom.xml ]]; then + echo "ERROR: You must run this script from the root of a valid Cloudberry Git clone" + echo " or pass the path using --repo ." + echo + echo "Missing one or more expected files:" + echo " - configure.ac" + echo " - gpMgmt/bin/gppylib/gpversion.py" + echo " - pom.xml" + exit 1 + fi +fi + +if ! $STAGE && [[ -z "$TAG" ]]; then + show_help +fi + +if $STAGE && [[ -z "$TAG" ]]; then + echo "ERROR: --tag (-t) is required when using --stage." >&2 + show_help +fi + +section "Validating Version Consistency" + +# Extract version from configure.ac +CONFIGURE_AC_VERSION=$(grep "^AC_INIT" configure.ac | sed -E "s/^AC_INIT\(\[[^]]+\], \[([^]]+)\].*/\1/") +CONFIGURE_AC_MAJOR=$(echo "$CONFIGURE_AC_VERSION" | cut -d. -f1) +EXPECTED="[$CONFIGURE_AC_MAJOR,99]" + +# Validate tag format +SEMVER_REGEX="^${CONFIGURE_AC_MAJOR}\\.[0-9]+\\.[0-9]+(-incubating(-rc[0-9]+)?)?$" +if ! [[ "$TAG" =~ $SEMVER_REGEX ]]; then + echo "ERROR: Tag '$TAG' does not match expected pattern for major version $CONFIGURE_AC_MAJOR (e.g., ${CONFIGURE_AC_MAJOR}.0.0-incubating or ${CONFIGURE_AC_MAJOR}.0.0-incubating-rc1)" + exit 1 +fi + +# Check gpversion.py consistency +PY_LINE=$(grep "^MAIN_VERSION" gpMgmt/bin/gppylib/gpversion.py | sed -E 's/#.*//' | tr -d '[:space:]') + +if [[ "$PY_LINE" != "MAIN_VERSION=$EXPECTED" ]]; then + echo "ERROR: gpversion.py MAIN_VERSION is $PY_LINE, but configure.ac suggests $EXPECTED" + echo "Please correct this mismatch before proceeding." + exit 1 +fi + +# For final releases (non-RC), ensure configure.ac version matches tag exactly +if [[ "$TAG" != *-rc* && "$CONFIGURE_AC_VERSION" != "$TAG" ]]; then + echo "ERROR: configure.ac version ($CONFIGURE_AC_VERSION) does not match final release tag ($TAG)" + echo "Please update configure.ac to match the tag before proceeding." + exit 1 +fi + +# Ensure the generated 'configure' script is up to date +CONFIGURE_VERSION_LINE=$(grep "^PACKAGE_VERSION=" configure || true) +CONFIGURE_VERSION=$(echo "$CONFIGURE_VERSION_LINE" | sed -E "s/^PACKAGE_VERSION='([^']+)'.*/\1/") + +if [[ "$CONFIGURE_VERSION" != "$TAG" ]]; then + echo "ERROR: Version in generated 'configure' script ($CONFIGURE_VERSION) does not match release tag ($TAG)." 
+ echo "This likely means autoconf was not run after updating configure.ac." + exit 1 +fi + +# Ensure xmllint is available +if ! command -v xmllint >/dev/null 2>&1; then + echo "ERROR: xmllint is required but not installed." + exit 1 +fi + +# Extract version from pom.xml using xmllint with namespace stripping +POM_VERSION=$(xmllint --xpath '//*[local-name()="project"]/*[local-name()="version"]/text()' pom.xml 2>/dev/null || true) + +if [[ -z "$POM_VERSION" ]]; then + echo "ERROR: Could not extract from pom.xml" + exit 1 +fi + +if [[ "$POM_VERSION" != "$TAG" ]]; then + echo "ERROR: Version in pom.xml ($POM_VERSION) does not match release tag ($TAG)." + echo "Please update pom.xml before tagging." + exit 1 +fi + +# Ensure working tree is clean +if ! git diff-index --quiet HEAD --; then + echo "ERROR: Working tree is not clean. Please commit or stash changes before proceeding." + exit 1 +fi + +echo "MAIN_VERSION verified" +printf " %-14s: %s\n" "Release Tag" "$TAG" +printf " %-14s: %s\n" "configure.ac" "$CONFIGURE_AC_VERSION" +printf " %-14s: %s\n" "configure" "$CONFIGURE_VERSION" +printf " %-14s: %s\n" "pom.xml" "$POM_VERSION" +printf " %-14s: %s\n" "gpversion.py" "${EXPECTED//[\[\]]}" + +section "Checking the state of the Tag" + +# Check if the tag already exists before making any changes +if git rev-parse "$TAG" >/dev/null 2>&1; then + TAG_COMMIT=$(git rev-list -n 1 "$TAG") + HEAD_COMMIT=$(git rev-parse HEAD) + + if [[ "$TAG_COMMIT" == "$HEAD_COMMIT" && "$FORCE_TAG_REUSE" == true ]]; then + echo "INFO: Tag '$TAG' already exists and matches HEAD. Proceeding with reuse." + elif [[ "$FORCE_TAG_REUSE" == true ]]; then + echo "ERROR: --force-tag-reuse was specified but tag '$TAG' does not match HEAD." + echo " Tags must be immutable. Cannot continue." + exit 1 + else + echo "ERROR: Tag '$TAG' already exists and does not match HEAD." + echo " Use --force-tag-reuse only when HEAD matches the tag commit." + exit 1 + fi +elif [[ "$FORCE_TAG_REUSE" == true ]]; then + echo "ERROR: --force-tag-reuse was specified, but tag '$TAG' does not exist." + echo " You can only reuse a tag if it already exists." + exit 1 +else + echo "INFO: Tag '$TAG' does not yet exist. It will be created during staging." +fi + +# Check and display submodule initialization status +if [ -s .gitmodules ]; then + section "Checking Git Submodules" + + UNINITIALIZED=false + while read -r status path rest; do + if [[ "$status" == "-"* ]]; then + echo "Uninitialized: $path" + UNINITIALIZED=true + else + echo "Initialized : $path" + fi + done < <(git submodule status) + + if [[ "$UNINITIALIZED" == true ]]; then + echo + echo "ERROR: One or more Git submodules are not initialized." + echo "Please run:" + echo " git submodule update --init --recursive" + echo "before proceeding with the release preparation." + exit 1 + fi +fi + +section "Checking GIT_USER_NAME and GIT_USER_EMAIL values" + +if $STAGE; then + # Validate Git environment before performing tag operation + GIT_USER_NAME=$(git config --get user.name || true) + GIT_USER_EMAIL=$(git config --get user.email || true) + + echo "Git User Info:" + printf " %-14s: %s\n" "user.name" "${GIT_USER_NAME:-}" + printf " %-14s: %s\n" "user.email" "${GIT_USER_EMAIL:-}" + + if [[ -z "$GIT_USER_NAME" || -z "$GIT_USER_EMAIL" ]]; then + echo "ERROR: Git configuration is incomplete." + echo + echo " Detected:" + echo " user.name = ${GIT_USER_NAME:-}" + echo " user.email = ${GIT_USER_EMAIL:-}" + echo + echo " Git requires both to be set in order to create annotated tags for releases." 
+ echo " You may configure them globally using:" + echo " git config --global user.name \"Your Name\"" + echo " git config --global user.email \"your@apache.org\"" + echo + echo " Alternatively, set them just for this repo using the same commands without --global." + exit 1 + fi + +section "Staging release: $TAG" + + if [[ "$FORCE_TAG_REUSE" == false ]]; then + confirm "You are about to create tag '$TAG'. Continue?" + git tag -a "$TAG" -m "Apache Cloudberry (Incubating) ${TAG} Release Candidate" + else + echo "INFO: Reusing existing tag '$TAG'; skipping tag creation." + fi + + echo "Creating BUILD_NUMBER file with value of 1" + echo "1" > BUILD_NUMBER + + echo -e "\nTag Summary" + TAG_OBJECT=$(git rev-parse "$TAG") + TAG_COMMIT=$(git rev-list -n 1 "$TAG") + echo "$TAG (tag object): $TAG_OBJECT" + echo " Points to commit: $TAG_COMMIT" + git log -1 --format="%C(auto)%h %d" "$TAG" + + section "Creating Source Tarball" + + TAR_NAME="apache-cloudberry-${TAG}-src.tar.gz" + TMP_DIR=$(mktemp -d) + trap 'rm -rf "$TMP_DIR"' EXIT + + git archive --format=tar --prefix="apache-cloudberry-${TAG}/" "$TAG" | tar -x -C "$TMP_DIR" + cp BUILD_NUMBER "$TMP_DIR/apache-cloudberry-${TAG}/" + + # Archive submodules if any + if [ -s .gitmodules ]; then + git submodule foreach --recursive --quiet " + echo \"Archiving submodule: \$sm_path\" + fullpath=\"\$toplevel/\$sm_path\" + destpath=\"$TMP_DIR/apache-cloudberry-$TAG/\$sm_path\" + mkdir -p \"\$destpath\" + git -C \"\$fullpath\" archive --format=tar --prefix=\"\$sm_path/\" HEAD | tar -x -C \"$TMP_DIR/apache-cloudberry-$TAG\" + " + fi + + tar -czf "$TAR_NAME" -C "$TMP_DIR" "apache-cloudberry-${TAG}" + rm -rf "$TMP_DIR" + echo -e "Archive saved to: $TAR_NAME" + + # Generate SHA-512 checksum + section "Generating SHA-512 Checksum" + + echo -e "\nGenerating SHA-512 checksum" + shasum -a 512 "$TAR_NAME" > "${TAR_NAME}.sha512" + echo "Checksum saved to: ${TAR_NAME}.sha512" + + section "Signing with GPG key: $GPG_USER" + # Conditionally generate GPG signature + if [[ "$SKIP_SIGNING" != true ]]; then + echo -e "\nSigning tarball with GPG key: $GPG_USER" + gpg --armor --detach-sign --local-user "$GPG_USER" "$TAR_NAME" + echo "GPG signature saved to: ${TAR_NAME}.asc" + else + echo "INFO: Skipping tarball signing as requested (--skip-signing)" + fi + + # Move artifacts to top-level artifacts directory + + ARTIFACTS_DIR="$(cd "$(dirname "$REPO_ARG")" && cd .. && pwd)/artifacts" + mkdir -p "$ARTIFACTS_DIR" + + section "Moving Artifacts to $ARTIFACTS_DIR" + + echo -e "\nMoving release artifacts to: $ARTIFACTS_DIR" + mv -vf "$TAR_NAME" "$ARTIFACTS_DIR/" + mv -vf "${TAR_NAME}.sha512" "$ARTIFACTS_DIR/" + [[ -f "${TAR_NAME}.asc" ]] && mv -vf "${TAR_NAME}.asc" "$ARTIFACTS_DIR/" + + section "Verifying sha512 ($ARTIFACTS_DIR/${TAR_NAME}.sha512) Release Artifact" + cd "$ARTIFACTS_DIR" + sha512sum -c "$ARTIFACTS_DIR/${TAR_NAME}.sha512" + + section "Verifying GPG Signature ($ARTIFACTS_DIR/${TAR_NAME}.asc) Release Artifact" + + if [[ "$SKIP_SIGNING" != true ]]; then + gpg --verify "${TAR_NAME}.asc" "$TAR_NAME" + else + echo "INFO: Signature verification skipped (--skip-signing). Signature is only available when generated via this script." 
+ fi + + section "Release candidate for $TAG staged successfully" +fi diff --git a/devops/release/generate-changelog.sh b/devops/release/generate-changelog.sh new file mode 100755 index 00000000000..9977971179c --- /dev/null +++ b/devops/release/generate-changelog.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# ====================================================================== +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ====================================================================== +# +# Generate changelog between two git references (tags/commits) +# Usage: ./generate-changelog.sh [repo_owner/repo_name] +# +# Examples: +# ./generate-changelog.sh 1a40e1f 8178d4f +# ./generate-changelog.sh v1.0.0 v1.1.0 +# ./generate-changelog.sh 1a40e1f 8178d4f apache/cloudberry +# ./generate-changelog.sh v1.0.0 v1.1.0 apache/cloudberry +# +# GitHub Token Setup: +# 1. Go to GitHub Settings > Developer settings > Personal access tokens > Tokens (classic) +# 2. Generate new token with 'repo' scope (for private repos) or 'public_repo' scope (for public repos) +# 3. Export the token: export GITHUB_TOKEN=your_token_here + +set -e + +# Default values +DEFAULT_REPO="apache/cloudberry" +GITHUB_API_BASE="https://api.github.com" + +# Function to display usage +usage() { + echo "Usage: $0 [repo_owner/repo_name]" + echo "" + echo "Examples:" + echo " $0 1a40e1f 8178d4f" + echo " $0 v1.0.0 v1.1.0" + echo " $0 1a40e1f 8178d4f apache/cloudberry" + echo " $0 v1.0.0 v1.1.0 apache/cloudberry" + echo "" + echo "Environment variables:" + echo " GITHUB_TOKEN - GitHub personal access token (required)" + exit 1 +} + +# Check arguments +if [ $# -lt 2 ] || [ $# -gt 3 ]; then + usage +fi + +FROM_REF="$1" +TO_REF="$2" +REPO="${3:-$DEFAULT_REPO}" + +# Check if GITHUB_TOKEN is set +if [ -z "$GITHUB_TOKEN" ]; then + echo "Error: GITHUB_TOKEN environment variable is required" + echo "Please set it with: export GITHUB_TOKEN=your_token_here" + exit 1 +fi + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is required but not installed" + echo "Please install jq using your system's package manager" + exit 1 +fi + +# Count total commits first +total_commits=$(git log --oneline "$FROM_REF..$TO_REF" | wc -l | tr -d ' ') + +echo "Generating changelog for $REPO from $FROM_REF to $TO_REF..." 
+echo "Found $total_commits commits to process" +echo "" + +# Generate changelog +git log --oneline --pretty=format:"%h|%H|%s|%an" "$FROM_REF..$TO_REF" | while IFS='|' read -r short_sha full_sha subject author; do + # Get PR number for this commit + pr_number=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \ + "$GITHUB_API_BASE/repos/$REPO/commits/$full_sha/pulls" | \ + jq -r '.[0].number // empty') + + if [ -n "$pr_number" ]; then + echo "* [\`$short_sha\`](https://github.com/$REPO/commit/$full_sha) - $subject ($author) [#$pr_number](https://github.com/$REPO/pull/$pr_number)" + else + echo "* [\`$short_sha\`](https://github.com/$REPO/commit/$full_sha) - $subject ($author)" + fi +done + +echo "" +echo "Changelog generation completed!" diff --git a/devops/sandbox/.env b/devops/sandbox/.env new file mode 100644 index 00000000000..233d7c5b1b5 --- /dev/null +++ b/devops/sandbox/.env @@ -0,0 +1,21 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +CODEBASE_VERSION=2.0.0 +OS_VERSION=rockylinux9 diff --git a/devops/sandbox/Dockerfile.RELEASE.rockylinux9 b/devops/sandbox/Dockerfile.RELEASE.rockylinux9 new file mode 100644 index 00000000000..f9f422f57f6 --- /dev/null +++ b/devops/sandbox/Dockerfile.RELEASE.rockylinux9 @@ -0,0 +1,287 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Multi-stage Dockerfile for Apache Cloudberry Sandbox Environment (Release) +# -------------------------------------------------------------------- +# This Dockerfile compiles and installs a specific release version of +# Cloudberry, then creates a runtime environment for testing and development. 
+# -------------------------------------------------------------------- + +# -------------------------------------------------------------------- +# Build stage: Rocky Linux 9 builder to compile Cloudberry (release tarball) +# -------------------------------------------------------------------- +FROM rockylinux/rockylinux:9.6 AS builder + +# Install build toolchains and development headers (avoid coreutils/curl conflicts on arm64) +RUN dnf makecache && \ + dnf install -y \ + epel-release \ + git && \ + dnf config-manager --disable epel-cisco-openh264 && \ + dnf makecache && \ + dnf config-manager --disable epel && \ + dnf install -y --enablerepo=epel \ + the_silver_searcher \ + bat \ + htop && \ + dnf install -y \ + bison \ + cmake3 \ + ed \ + file \ + flex \ + gcc \ + gcc-c++ \ + gdb \ + glibc-langpack-en \ + glibc-locale-source \ + initscripts \ + iproute \ + less \ + lsof \ + m4 \ + net-tools \ + openssh-clients \ + openssh-server \ + perl \ + rpm-build \ + rpmdevtools \ + rsync \ + sudo \ + tar \ + unzip \ + util-linux-ng \ + wget \ + sshpass \ + which && \ + dnf install -y \ + apr-devel \ + bzip2-devel \ + java-11-openjdk \ + java-11-openjdk-devel \ + krb5-devel \ + libcurl-devel \ + libevent-devel \ + libxml2-devel \ + libuuid-devel \ + libzstd-devel \ + lz4 \ + lz4-devel \ + openldap-devel \ + openssl-devel \ + pam-devel \ + perl-ExtUtils-Embed \ + perl-Test-Simple \ + perl-core \ + python3-devel \ + python3-pytest \ + readline-devel \ + zlib-devel && \ + dnf install -y --enablerepo=crb \ + libuv-devel \ + libyaml-devel \ + perl-IPC-Run \ + protobuf-devel && \ + dnf clean all && \ + cd && XERCES_LATEST_RELEASE=3.3.0 && \ + wget -nv "https://archive.apache.org/dist/xerces/c/3/sources/xerces-c-${XERCES_LATEST_RELEASE}.tar.gz" && \ + echo "$(curl -sL https://archive.apache.org/dist/xerces/c/3/sources/xerces-c-${XERCES_LATEST_RELEASE}.tar.gz.sha256)" | sha256sum -c - && \ + tar xf "xerces-c-${XERCES_LATEST_RELEASE}.tar.gz"; rm "xerces-c-${XERCES_LATEST_RELEASE}.tar.gz" && \ + cd xerces-c-${XERCES_LATEST_RELEASE} && \ + ./configure --prefix=/usr/local/xerces-c && \ + make -j$(nproc) && \ + make install -C ~/xerces-c-${XERCES_LATEST_RELEASE} && \ + rm -rf ~/xerces-c* + +# Create gpadmin user and grant passwordless sudo in builder +RUN groupadd -r gpadmin && \ + useradd -m -r -g gpadmin -s /bin/bash gpadmin && \ + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/gpadmin && \ + chmod 440 /etc/sudoers.d/gpadmin + +# Switch to gpadmin user +USER gpadmin +WORKDIR /home/gpadmin + +# Release version to build (Apache official tarball) +ARG CB_RELEASE_VERSION=2.0.0-incubating + +# Download and extract the specified release version from Apache +RUN wget -nv "https://downloads.apache.org/incubator/cloudberry/${CB_RELEASE_VERSION}/apache-cloudberry-${CB_RELEASE_VERSION}-src.tar.gz" -O /home/gpadmin/apache-cloudberry-${CB_RELEASE_VERSION}-src.tar.gz && \ + tar -xzf /home/gpadmin/apache-cloudberry-${CB_RELEASE_VERSION}-src.tar.gz -C /home/gpadmin && \ + rm -f /home/gpadmin/apache-cloudberry-${CB_RELEASE_VERSION}-src.tar.gz && \ + mv /home/gpadmin/apache-cloudberry-${CB_RELEASE_VERSION} /home/gpadmin/cloudberry + +# Build Cloudberry using the official build scripts +RUN cd /home/gpadmin/cloudberry && \ + export SRC_DIR=/home/gpadmin/cloudberry && \ + mkdir -p "${SRC_DIR}/build-logs" && \ + # Ensure Cloudberry lib dir exists and has Xerces libs available + sudo rm -rf /usr/local/cloudberry-db && \ + sudo mkdir -p /usr/local/cloudberry-db/lib && \ + sudo cp -v /usr/local/xerces-c/lib/libxerces-c.so 
\ + /usr/local/xerces-c/lib/libxerces-c-3.*.so \ + /usr/local/cloudberry-db/lib/ && \ + sudo chown -R gpadmin:gpadmin /usr/local/cloudberry-db && \ + # Configure with required features and paths + export LD_LIBRARY_PATH=/usr/local/cloudberry-db/lib:$LD_LIBRARY_PATH && \ + ./configure --prefix=/usr/local/cloudberry-db \ + --disable-external-fts \ + --enable-debug \ + --enable-cassert \ + --enable-debug-extensions \ + --enable-gpcloud \ + --enable-ic-proxy \ + --enable-mapreduce \ + --enable-orafce \ + --enable-orca \ + --enable-pax \ + --enable-pxf \ + --enable-tap-tests \ + --with-gssapi \ + --with-ldap \ + --with-libxml \ + --with-lz4 \ + --with-pam \ + --with-perl \ + --with-pgport=5432 \ + --with-python \ + --with-pythonsrc-ext \ + --with-ssl=openssl \ + --with-uuid=e2fs \ + --with-includes=/usr/local/xerces-c/include \ + --with-libraries=/usr/local/cloudberry-db/lib && \ + # Build and install + make -j$(nproc) --directory ${SRC_DIR} && \ + make -j$(nproc) --directory ${SRC_DIR}/contrib && \ + make install --directory ${SRC_DIR} && \ + make install --directory "${SRC_DIR}/contrib" + +# -------------------------------------------------------------------- +# Runtime stage: Rocky Linux 9 runtime with required dependencies +# -------------------------------------------------------------------- +FROM rockylinux/rockylinux:9.6 + +# Install required runtime dependencies, SSH server, sudo, and tools +# Note: Use dnf on Rocky Linux 9 +RUN dnf -y update && \ + dnf -y install \ + openssh-server openssh-clients \ + sudo shadow-utils \ + bash procps-ng \ + ca-certificates \ + python3 \ + apr \ + bzip2-libs \ + krb5-libs \ + libevent \ + libicu \ + libuuid \ + libxml2 \ + libyaml \ + libzstd \ + lz4 \ + ncurses \ + openldap \ + openssl \ + pam \ + pcre2 \ + perl \ + protobuf \ + readline \ + zlib \ + glibc-langpack-en \ + libuv \ + iproute \ + net-tools \ + which \ + rsync \ + keyutils \ + libstdc++ && \ + dnf clean all && rm -rf /var/cache/dnf + +# Create gpadmin user and group, grant passwordless sudo +RUN groupadd -r gpadmin && \ + useradd -m -r -g gpadmin -s /bin/bash gpadmin && \ + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/gpadmin && \ + chmod 440 /etc/sudoers.d/gpadmin + +# Prepare SSH daemon: generate host keys, ensure runtime dir, and allow gpadmin to start it +RUN ssh-keygen -A && mkdir -p /run/sshd && chmod u+s /usr/sbin/sshd + +# Copy built Cloudberry from builder stage +COPY --from=builder /usr/local/cloudberry-db /usr/local/cloudberry-db + +# Bring Xerces libs into Cloudberry lib dir +COPY --from=builder /usr/local/xerces-c/lib/libxerces-c.so /usr/local/cloudberry-db/lib/ +COPY --from=builder /usr/local/xerces-c/lib/libxerces-c-3.*.so /usr/local/cloudberry-db/lib/ + +# Copy configuration files to their final destinations +COPY ./configs/90-cbdb-limits.conf /etc/security/limits.d/90-cbdb-limits.conf +COPY ./configs/90-cbdb-sysctl.conf /etc/sysctl.d/90-cbdb-sysctl.conf +COPY ./configs/gpinitsystem_singlenode /tmp/gpinitsystem_singlenode +COPY ./configs/gpinitsystem_multinode /tmp/gpinitsystem_multinode +COPY ./configs/multinode-gpinit-hosts /tmp/multinode-gpinit-hosts +COPY ./configs/init_system.sh /tmp/init_system.sh + +# Runtime configuration +RUN echo "cdw" > /tmp/gpdb-hosts && \ + chmod 755 /tmp/gpinitsystem_singlenode && \ + chmod 755 /tmp/gpinitsystem_multinode && \ + chmod 755 /tmp/init_system.sh && \ + mkdir -p /data0/database/coordinator /data0/database/primary /data0/database/mirror && \ + chown -R gpadmin:gpadmin \ + /usr/local/cloudberry-db \ + 
/tmp/gpinitsystem_singlenode \ + /tmp/gpinitsystem_multinode \ + /tmp/gpdb-hosts \ + /tmp/multinode-gpinit-hosts \ + /data0 && \ + echo "export COORDINATOR_DATA_DIRECTORY=/data0/database/coordinator/gpseg-1" >> /home/gpadmin/.bashrc && \ + echo -e '\n# Add Cloudberry entries\nif [ -f /usr/local/cloudberry-db/cloudberry-env.sh ]; then\n source /usr/local/cloudberry-db/cloudberry-env.sh\nfi\n# Add Greenplum compatibility entries\nif [ -f /usr/local/cloudberry-db/greenplum_path.sh ]; then\n source /usr/local/cloudberry-db/greenplum_path.sh\nfi' >> /home/gpadmin/.bashrc + +# ---------------------------------------------------------------------- +# Generate SSH keypair for gpadmin user at build time +# ---------------------------------------------------------------------- +# WARNING: This embeds a fixed SSH keypair in the Docker image for +# sandbox convenience. This is ONLY suitable for local testing and +# development. DO NOT use this image in production or any environment +# where security is a concern. +# ---------------------------------------------------------------------- +RUN mkdir -p /home/gpadmin/.ssh && \ + ssh-keygen -t rsa -b 4096 -N '' -C 'gpadmin@cloudberry-sandbox' \ + -f /home/gpadmin/.ssh/id_rsa && \ + cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys && \ + chmod 700 /home/gpadmin/.ssh && \ + chmod 600 /home/gpadmin/.ssh/id_rsa && \ + chmod 644 /home/gpadmin/.ssh/id_rsa.pub && \ + chmod 600 /home/gpadmin/.ssh/authorized_keys && \ + chown -R gpadmin:gpadmin /home/gpadmin/.ssh + +# Set default user and working directory +USER gpadmin +WORKDIR /home/gpadmin + +EXPOSE 5432 22 + +# cgroup mount (provided by compose/run) +VOLUME [ "/sys/fs/cgroup" ] + +# Start the container by running the initialization script +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/sandbox/Dockerfile.main.rockylinux9 b/devops/sandbox/Dockerfile.main.rockylinux9 new file mode 100644 index 00000000000..03619cda4ba --- /dev/null +++ b/devops/sandbox/Dockerfile.main.rockylinux9 @@ -0,0 +1,262 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Multi-stage Dockerfile for Apache Cloudberry Sandbox Environment +# -------------------------------------------------------------------- +# This Dockerfile supports two source modes: +# - Main branch: clone the latest main from GitHub (CODEBASE_VERSION=main) +# - Local source: use the repository contents from the Docker build context +# (CODEBASE_VERSION=local), recommended for developers working from +# their current checkout. +# In both modes it compiles and installs Cloudberry, then creates a +# runtime environment for testing and development. 
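+# A direct build (normally driven by run.sh) looks roughly like this, assuming it is
+# run from the repository root so the build context contains the full checkout:
+#   docker build -f devops/sandbox/Dockerfile.main.rockylinux9 \
+#     --build-arg CODEBASE_VERSION=local \
+#     -t cbdb-local:rockylinux9 .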
+# -------------------------------------------------------------------- + +# Build stage: Use pre-built image to compile Cloudberry +ARG CODEBASE_VERSION=main +FROM rockylinux/rockylinux:9.6 AS builder +ARG CODEBASE_VERSION + +# Install build toolchains and development headers (avoid coreutils/curl conflicts on arm64) +RUN dnf makecache && \ + dnf install -y \ + epel-release \ + git && \ + dnf config-manager --disable epel-cisco-openh264 && \ + dnf makecache && \ + dnf config-manager --disable epel && \ + dnf install -y --enablerepo=epel \ + the_silver_searcher \ + bat \ + htop && \ + dnf install -y \ + bison \ + cmake3 \ + ed \ + file \ + flex \ + gcc \ + gcc-c++ \ + gdb \ + glibc-langpack-en \ + glibc-locale-source \ + initscripts \ + iproute \ + less \ + lsof \ + m4 \ + net-tools \ + openssh-clients \ + openssh-server \ + perl \ + rpm-build \ + rpmdevtools \ + rsync \ + sudo \ + tar \ + unzip \ + util-linux-ng \ + wget \ + sshpass \ + which && \ + dnf install -y \ + apr-devel \ + bzip2-devel \ + java-11-openjdk \ + java-11-openjdk-devel \ + krb5-devel \ + libcurl-devel \ + libevent-devel \ + libxml2-devel \ + libuuid-devel \ + libzstd-devel \ + lz4 \ + lz4-devel \ + openldap-devel \ + openssl-devel \ + pam-devel \ + perl-ExtUtils-Embed \ + perl-Test-Simple \ + perl-core \ + python3-devel \ + python3-pytest \ + readline-devel \ + zlib-devel && \ + dnf install -y --enablerepo=crb \ + liburing-devel \ + libuv-devel \ + libyaml-devel \ + perl-IPC-Run \ + protobuf-devel && \ + dnf clean all && \ + cd && XERCES_LATEST_RELEASE=3.3.0 && \ + wget -nv "https://archive.apache.org/dist/xerces/c/3/sources/xerces-c-${XERCES_LATEST_RELEASE}.tar.gz" && \ + echo "$(curl -sL https://archive.apache.org/dist/xerces/c/3/sources/xerces-c-${XERCES_LATEST_RELEASE}.tar.gz.sha256)" | sha256sum -c - && \ + tar xf "xerces-c-${XERCES_LATEST_RELEASE}.tar.gz"; rm "xerces-c-${XERCES_LATEST_RELEASE}.tar.gz" && \ + cd xerces-c-${XERCES_LATEST_RELEASE} && \ + ./configure --prefix=/usr/local/xerces-c && \ + make -j$(nproc) && \ + make install -C ~/xerces-c-${XERCES_LATEST_RELEASE} && \ + rm -rf ~/xerces-c* + +# Create gpadmin user and grant passwordless sudo in builder +RUN groupadd -r gpadmin && \ + useradd -m -r -g gpadmin -s /bin/bash gpadmin && \ + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/gpadmin && \ + chmod 440 /etc/sudoers.d/gpadmin + +# Switch to gpadmin user +USER gpadmin +WORKDIR /home/gpadmin + +# Copy repository contents from build context +# Note: This COPY always executes regardless of CODEBASE_VERSION due to Docker +# layer caching behavior. For main branch builds, the copied content will be +# removed and replaced by a fresh git clone in the RUN step below. This is an +# acceptable tradeoff to keep the Dockerfile simple and maintainable. +COPY --chown=gpadmin:gpadmin . 
/home/gpadmin/cloudberry + +# Obtain Cloudberry source code based on build mode +RUN if [ "${CODEBASE_VERSION}" = "local" ]; then \ + echo "Using local source from build context"; \ + else \ + rm -rf /home/gpadmin/cloudberry && \ + git clone --recurse-submodules --branch main --single-branch --depth=1 https://github.com/apache/cloudberry.git; \ + fi + +# Build Cloudberry using the official build scripts +RUN cd /home/gpadmin/cloudberry && \ + export SRC_DIR=/home/gpadmin/cloudberry && \ + mkdir -p ${SRC_DIR}/build-logs && \ + ./devops/build/automation/cloudberry/scripts/configure-cloudberry.sh && \ + ./devops/build/automation/cloudberry/scripts/build-cloudberry.sh + +# -------------------------------------------------------------------- +# Runtime stage: Switch to a slimmer base image (Rocky Linux 9) +# -------------------------------------------------------------------- +FROM rockylinux/rockylinux:9.6 + +# Install required runtime dependencies, SSH server, sudo, and tools +# Note: Use dnf on Rocky Linux 9 +RUN dnf -y update && \ + dnf -y install \ + openssh-server openssh-clients \ + sudo shadow-utils \ + bash procps-ng \ + ca-certificates \ + python3 \ + apr \ + bzip2-libs \ + krb5-libs \ + libevent \ + libicu \ + liburing \ + libuuid \ + libxml2 \ + libyaml \ + libzstd \ + lz4 \ + ncurses \ + openldap \ + openssl \ + pam \ + pcre2 \ + perl \ + protobuf \ + readline \ + zlib \ + glibc-langpack-en \ + libuv \ + iproute \ + net-tools \ + which \ + rsync \ + keyutils \ + libstdc++ && \ + dnf clean all && rm -rf /var/cache/dnf + +# Create gpadmin user and group, grant passwordless sudo +RUN groupadd -r gpadmin && \ + useradd -m -r -g gpadmin -s /bin/bash gpadmin && \ + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/gpadmin && \ + chmod 440 /etc/sudoers.d/gpadmin + +# Prepare SSH daemon: generate host keys, ensure runtime dir, and allow gpadmin to start it +RUN ssh-keygen -A && mkdir -p /run/sshd && chmod u+s /usr/sbin/sshd + +# Copy built Cloudberry from builder stage +COPY --from=builder /usr/local/cloudberry-db /usr/local/cloudberry-db + +# Bring Xerces libs into Cloudberry lib dir and normalize SONAME via builder-installed versioned prefix +COPY --from=builder /usr/local/xerces-c/lib/libxerces-c.so /usr/local/cloudberry-db/lib/ +COPY --from=builder /usr/local/xerces-c/lib/libxerces-c-3.*.so /usr/local/cloudberry-db/lib/ + +# Copy configuration files to their final destinations +COPY devops/sandbox/configs/90-cbdb-limits.conf /etc/security/limits.d/90-cbdb-limits.conf +COPY devops/sandbox/configs/90-cbdb-sysctl.conf /etc/sysctl.d/90-cbdb-sysctl.conf +COPY devops/sandbox/configs/gpinitsystem_singlenode /tmp/gpinitsystem_singlenode +COPY devops/sandbox/configs/gpinitsystem_multinode /tmp/gpinitsystem_multinode +COPY devops/sandbox/configs/multinode-gpinit-hosts /tmp/multinode-gpinit-hosts +COPY devops/sandbox/configs/init_system.sh /tmp/init_system.sh + +# Runtime configuration +RUN echo "cdw" > /tmp/gpdb-hosts && \ + chmod 755 /tmp/gpinitsystem_singlenode && \ + chmod 755 /tmp/gpinitsystem_multinode && \ + chmod 755 /tmp/init_system.sh && \ + mkdir -p /data0/database/coordinator /data0/database/primary /data0/database/mirror && \ + chown -R gpadmin:gpadmin \ + /usr/local/cloudberry-db \ + /tmp/gpinitsystem_singlenode \ + /tmp/gpinitsystem_multinode \ + /tmp/gpdb-hosts \ + /tmp/multinode-gpinit-hosts \ + /data0 && \ + echo "export COORDINATOR_DATA_DIRECTORY=/data0/database/coordinator/gpseg-1" >> /home/gpadmin/.bashrc && \ + echo -e '\n# Add Cloudberry entries\nif [ -f 
/usr/local/cloudberry-db/cloudberry-env.sh ]; then\n source /usr/local/cloudberry-db/cloudberry-env.sh\nfi' >> /home/gpadmin/.bashrc + +# ---------------------------------------------------------------------- +# Generate SSH keypair for gpadmin user at build time +# ---------------------------------------------------------------------- +# WARNING: This embeds a fixed SSH keypair in the Docker image for +# sandbox convenience. This is ONLY suitable for local testing and +# development. DO NOT use this image in production or any environment +# where security is a concern. +# ---------------------------------------------------------------------- +RUN mkdir -p /home/gpadmin/.ssh && \ + ssh-keygen -t rsa -b 4096 -N '' -C 'gpadmin@cloudberry-sandbox' \ + -f /home/gpadmin/.ssh/id_rsa && \ + cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys && \ + chmod 700 /home/gpadmin/.ssh && \ + chmod 600 /home/gpadmin/.ssh/id_rsa && \ + chmod 644 /home/gpadmin/.ssh/id_rsa.pub && \ + chmod 600 /home/gpadmin/.ssh/authorized_keys && \ + chown -R gpadmin:gpadmin /home/gpadmin/.ssh + +# Set default user and working directory +USER gpadmin +WORKDIR /home/gpadmin + +EXPOSE 5432 22 + +# cgroup mount (provided by compose/run) +VOLUME [ "/sys/fs/cgroup" ] + +# Start the container by running the initialization script +CMD ["bash","-c","/tmp/init_system.sh"] diff --git a/devops/sandbox/README.md b/devops/sandbox/README.md new file mode 100644 index 00000000000..9f475977835 --- /dev/null +++ b/devops/sandbox/README.md @@ -0,0 +1,232 @@ + +--- +title: Sandbox of Apache Cloudberry +--- + +# Install Apache Cloudberry With Docker + +This document guides you on how to quickly set up and connect to Apache Cloudberry in a Docker environment. You can try out Apache Cloudberry by performing some basic operations and running SQL commands. + +> [!WARNING] +> This guide is intended for testing or development. DO NOT use it for production. + +> [!WARNING] +> **Security Notice: Embedded SSH Keys** +> +> For ease of use, this sandbox environment includes **pre-generated SSH keys embedded in the Docker image**. All containers built from the same image share the same SSH keypair, allowing passwordless SSH communication between nodes. +> +> **This is ONLY acceptable for local testing and development environments.** These embedded keys are **NOT secure** and must **NEVER** be used in production or any environment where security is a concern. Anyone with access to the Docker image can extract these keys. + + +## Prerequisites + +Make sure that your environment meets the following requirements: + +- Platform requirement: Any platform with Docker runtime. For details, refer to [Get Started with Docker](https://www.docker.com/get-started/). +- Other dependencies: Git, SSH, and internet connection + +## Build the Sandbox + +When building and deploying Apache Cloudberry in Docker, you will have 2 different deployment options as well as different build options. + +**Deployment Options** +1. **Single Container** (Default) - With the single container option, you will have the coordinator as well as the Apache Cloudberry segments all running on a single container. This is the default behavior when deploying using the `run.sh` script provided. +2. **Multi-Container** - Deploying with the multi-container option will give you a more realistic deployment of what actual production Apache Cloudberry clusters look like. 
With multi-node, you will have the coordinator, the standby coordinator, and 2 segment hosts all on their own respective containers. This is to both highlight the distributed nature of Apache Cloudberry as well as highlight how high availability (HA) features work in the event of a server (or in this case a container) failing. This is enabled by passing the -m flag to the `run.sh` script which will be highlighted below. + +![cloudberry Sandbox Deployments](./sandbox-deployment.jpg) + +**Build Options** + +1. **Recommended for most users** – Build directly from your current local source code using `-c local`. This is the fastest way to get started as it reuses your existing checkout, avoiding the need to download the code again inside the container. It is also ideal for developers testing local changes. +2. Compile with the source code of the latest Apache Cloudberry (released in [Apache Cloudberry Release Page](https://github.com/apache/cloudberry/releases)). The base OS will be Rocky Linux 9 Docker image. +3. Compile with the latest Apache Cloudberry [main](https://github.com/apache/cloudberry/tree/main) branch. The base OS will be Rocky Linux 9 Docker image. + +Build and deploy steps: + +1. Start Docker Desktop and make sure it is running properly on your host platform. + +2. Clone the Apache Cloudberry repository to the target machine. + + ```shell + git clone https://github.com/apache/cloudberry.git + ``` + +3. Enter the repository and run the `run.sh` script to start the Docker container. This will start the automatic installation process. Depending on your environment, you may need to run this with 'sudo' command. + + - **Recommended: Build from your current local source code (single container)** + + This is the most efficient option for both new users and developers. It uses your local checkout directly, saving time by skipping the code download step inside the container. It also allows you to immediately test any local code modifications. + + ```shell + cd cloudberry/devops/sandbox + ./run.sh -c local + ``` + + - **Recommended: Build from your current local source code (multi-container)** + + Same as above, but deploys a multi-container cluster. Ideal for testing distributed features or high availability with your local code. + + ```shell + cd cloudberry/devops/sandbox + ./run.sh -c local -m + ``` + + - For latest Apache Cloudberry release running on a single container + + ```shell + cd cloudberry/devops/sandbox + ./run.sh -c 2.0.0 + ``` + + - For latest Apache Cloudberry release running across multiple containers + + ```shell + cd cloudberry/devops/sandbox + ./run.sh -c 2.0.0 -m + ``` + + - For latest main branch running on a single container + + ```shell + cd cloudberry/devops/sandbox + ./run.sh -c main + ``` + + - For latest main branch running across multiple containers + + ```shell + cd cloudberry/devops/sandbox + ./run.sh -c main -m + ``` + + Once the script finishes without error, the sandbox is built and running successfully. The `docker run` and `docker compose` commands use the --detach option allowing you to ssh or access the running Apache Cloudberry instance remotely. + + Please review run.sh script for additional options (e.g. setting Timezone in running container, only building container). You can also execute `./run.sh -h` to see the usage. 
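+Before connecting, you can optionally sanity-check the build and container state. The commands below assume the default image and container names used by `run.sh` (`cbdb-<version>:<os>` and `cbdb-cdw`):
+
+```shell
+# List sandbox images built by run.sh
+docker images --filter "reference=cbdb-*"
+
+# Confirm the coordinator container is up
+docker ps --filter "name=cbdb-cdw"
+
+# Follow the initialization log until the "DEPLOYMENT SUCCESSFUL" banner appears
+docker logs -f cbdb-cdw
+```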
+ +## Connect to the database + +> [!NOTE] +> When deploying the multi-container Apache Cloudberry environment it may take extra time for the database to initialize, so you may need to wait a few minutes before you can execute the psql prompt successfully. You can run `docker logs cbdb-cdw -f` to see the current state of the database initialization process, you'll know the process is finished when you see the "Deployment Successful" output. + +You can now connect to the database and try some basic operations. + +1. Connect to the Docker container from the host machine: + + ```shell + docker exec -it cbdb-cdw /bin/bash + ``` + + If it is successful, you will see the following prompt: + + ```shell + [gpadmin@cdw /]$ + ``` + +2. Log into Apache Cloudberry in Docker. See the following commands and example outputs: + + ```shell + [gpadmin@cdw ~]$ psql # Connects to the database with the default database name "gpadmin". + + # psql (14.4, server 14.4) + # Type "help" for help. + # Note: No password is required for the gpadmin user in this sandbox environment. + ``` + + ```sql + gpadmin=# SELECT VERSION(); -- Checks the database version. + + PostgreSQL 14.4 (Apache Cloudberry 1.0.0 build dev) on aarch64-unknown-linux-gnu, compiled by gcc (GCC) 10.2.1 20210130 (Red Hat 10.2.1-11), 64-bit compiled on Oct 24 2023 10:24:28 + (1 row) + ``` + +3. Alternatively, you can connect to the database directly from your host machine without entering the container: + + The Apache Cloudberry coordinator port (default 5432) is mapped to port **15432** on your host machine. You can use the `psql` client on your host to connect directly: + + ```shell + # Connect from host machine + psql -h localhost -p 15432 -d postgres -U gpadmin + ``` + + > [!NOTE] + > - No password is required for the `gpadmin` user in this sandbox environment. + > - Make sure you have PostgreSQL client (`psql`) installed on your host machine. + > - The port mapping is: Container port `5432` → Host port `15432` + +Now you have an Apache Cloudberry and can continue with [Apache Cloudberry Tutorials](https://cloudberry.apache.org/docs/)! Enjoy! + +## Working with your Apache Cloudberry Docker environment + +When working with the Apache Cloudberry Docker environment there are a few commands that will be useful to you. + +**Stopping Your Single Container Deployment With Docker** + +To stop the **single container** deployment while _keeping the data and state_ within the container, you can run the command below. This means that you can later start the container again and any changes you made to the containers will be persisted between runs. + +```shell +docker stop cbdb-cdw +``` + +To stop the **single container** deployment and also remove the volume that belongs to the container, you can run the following command. Keep in mind this will remove the volume as well as the container associated which means any changes you've made inside of the container or any database state will be wiped and unrecoverable. + +```shell +docker rm -f cbdb-cdw +``` + +**Stopping Your Multi-Container Deployment With Docker** + +To stop the **multi-container** deployment while _keeping the data and state_ within the container, you can run the command below. This means that you can later start the container again and any changes you made to the containers will be persisted between runs. 
+ +```shell +docker compose -f docker-compose-rockylinux9.yml stop +``` + +To stop the **multi-container** deployment and also remove the network and volumes that belong to the containers, you can run the command below. Running this command means it will delete the containers as well as remove the volumes that the containers are associated with. This means any changes you've made inside of the containers or any database state will be wiped and unrecoverable. + +```shell +docker compose -f docker-compose-rockylinux9.yml down -v +``` + +**Starting A Stopped Single Container Apache Cloudberry Docker Deployment** + +If you've run any of the commands above that keep the Docker volumes persisted between shutting the containers down, you can use the following commands to bring that same deployment back up with it's previous state. + +To start a **single container** deployment after it was shut down, you can simply run the following + +```shell +docker start cbdb-cdw +``` + +**Starting A Stopped Multi-Container Apache Cloudberry Docker Deployment** + +To start a **multi-container** deployment after it was shut down, you can run the following command. + +```shell +docker compose -f docker-compose-rockylinux9.yml start +``` + +> [!NOTE] +> When starting a previously stopped Apache Cloudberry Docker environment, you'll need to manually start the database back up. To do this, just run the following commands once the container(s) are back up and running. The `gpstart` command is used for starting the database, and -a is a flag saying to start the database without prompting (non-interactive). + +```shell +docker exec -it cbdb-cdw /bin/bash + +[gpadmin@cdw /] gpstart -a +``` \ No newline at end of file diff --git a/devops/sandbox/configs/90-cbdb-limits.conf b/devops/sandbox/configs/90-cbdb-limits.conf new file mode 100644 index 00000000000..33088f92278 --- /dev/null +++ b/devops/sandbox/configs/90-cbdb-limits.conf @@ -0,0 +1,29 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +###################### +# CBDB CONFIG PARAMS # +###################### + + * soft core unlimited + * hard core unlimited + * soft nofile 524288 + * hard nofile 524288 + * soft nproc 131072 + * hard nproc 131072 diff --git a/devops/sandbox/configs/90-cbdb-sysctl.conf b/devops/sandbox/configs/90-cbdb-sysctl.conf new file mode 100644 index 00000000000..9f0b7c576e4 --- /dev/null +++ b/devops/sandbox/configs/90-cbdb-sysctl.conf @@ -0,0 +1,54 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +###################### +# CBDB CONFIG PARAMS # +###################### + +kernel.shmmax = 1000000000 +kernel.shmall = 4000000000 +kernel.shmmni = 4096 +vm.overcommit_memory = 2 +vm.overcommit_ratio = 95 +net.ipv4.ip_local_port_range = 10000 65535 +kernel.sem = 250 2048000 200 8192 +kernel.sysrq = 1 +kernel.core_uses_pid = 1 +kernel.msgmnb = 65536 +kernel.msgmax = 65536 +kernel.msgmni = 2048 +net.ipv4.tcp_syncookies = 1 +net.ipv4.conf.default.accept_source_route = 0 +net.ipv4.tcp_max_syn_backlog = 4096 +net.ipv4.conf.all.arp_filter = 1 +net.ipv4.ipfrag_high_thresh = 41943040 +net.ipv4.ipfrag_low_thresh = 31457280 +net.ipv4.ipfrag_time = 60 +net.core.netdev_max_backlog = 10000 +net.core.rmem_max = 2097152 +net.core.wmem_max = 2097152 +vm.swappiness = 10 +vm.zone_reclaim_mode = 0 +vm.dirty_expire_centisecs = 500 +vm.dirty_writeback_centisecs = 100 +vm.dirty_background_ratio = 0 +vm.dirty_ratio = 0 +vm.dirty_background_bytes = 1610612736 +vm.dirty_bytes = 4294967296 +kernel.core_pattern=/var/core/core.%h.%t diff --git a/devops/sandbox/configs/gpinitsystem_multinode b/devops/sandbox/configs/gpinitsystem_multinode new file mode 100644 index 00000000000..d6a46d4d410 --- /dev/null +++ b/devops/sandbox/configs/gpinitsystem_multinode @@ -0,0 +1,138 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# FILE NAME: gpinitsystem_multinode + +# A configuration file is needed by the gpinitsystem utility. +# This sample file initializes an Apache Cloudberry multi-node cluster +# with one coordinator, one standby coordinator, and segment instances +# across multiple hosts. This file is referenced when you run gpinitsystem. + +################################################ +# REQUIRED PARAMETERS +################################################ + +# A name for the array you are configuring. You can use any name you +# like. Enclose the name in quotes if the name contains spaces. 
+ +ARRAY_NAME="Sandbox: Apache Cloudberry Cluster" + +# This specifies the file that contains the list of segment host names +# that comprise the Greenplum system. For a single-node system, this +# file contains the local OS-configured hostname (as output by the +# hostname command). If the file does not reside in the same +# directory where the gpinitsystem utility is executed, specify +# the absolute path to the file. + +MACHINE_LIST_FILE=/tmp/gpdb-hosts + +# This specifies a prefix that will be used to name the data directories +# of the coordinator and segment instances. The naming convention for data +# directories in a Apache Cloudberry system is SEG_PREFIX +# where starts with 0 for segment instances and the coordinator +# is always -1. So for example, if you choose the prefix gpsne, your +# coordinator instance data directory would be named gpsne-1, and the segment +# instances would be named gpsne0, gpsne1, gpsne2, gpsne3, and so on. + +SEG_PREFIX=gpseg + +# Base port number on which primary segment instances will be +# started on a segment host. The base port number will be +# incremented by one for each segment instance started on a host. + +PORT_BASE=40000 + +# This specifies the data storage location(s) where the script will +# create the primary segment data directories. The script creates a +# unique data directory for each segment instance. If you want multiple +# segment instances per host, list a data storage area for each primary +# segment you want created. The recommended number is one primary segment +# per CPU. It is OK to list the same data storage area multiple times +# if you want your data directories created in the same location. The +# number of data directory locations specified will determine the number +# of primary segment instances created per host. +# You must make sure that the user who runs gpinitsystem (for example, +# the gpadmin user) has permissions to write to these directories. You +# may want to create these directories on the segment hosts before running +# gpinitsystem and chown them to the appropriate user. + +declare -a DATA_DIRECTORY=(/data0/database/primary \ + /data0/database/primary) + +# The OS-configured hostname of the Apache Cloudberry coordinator instance. + +COORDINATOR_HOSTNAME=cdw + +# The location where the data directory will be created on the +# Greenplum coordinator host. +# You must make sure that the user who runs gpinitsystem +# has permissions to write to this directory. You may want to +# create this directory on the coordinator host before running +# gpinitsystem and chown it to the appropriate user. + +COORDINATOR_DIRECTORY=/data0/database/coordinator + +# The port number for the coordinator instance. This is the port number +# that users and client connections will use when accessing the +# Apache Cloudberry system. + +COORDINATOR_PORT=5432 + +# The shell the gpinitsystem script uses to execute +# commands on remote hosts. Allowed value is ssh. You must set up +# your trusted host environment before running the gpinitsystem +# script. You can use gpssh-exkeys to do this. + +TRUSTED_SHELL=ssh + +# Maximum distance between automatic write ahead log (WAL) +# checkpoints, in log file segments (each segment is normally 16 +# megabytes). This will set the checkpoint_segments parameter +# in the postgresql.conf file for each segment instance in the +# Apache Cloudberry system. + +CHECK_POINT_SEGMENTS=8 + +# The character set encoding to use. Greenplum supports the +# same character sets as PostgreSQL. 
See 'Character Set Support' +# in the PostgreSQL documentation for allowed character sets. +# Should correspond to the OS locale specified with the +# gpinitsystem -n option. + +ENCODING=UNICODE + +################################################ +# OPTIONAL PARAMETERS +################################################ + +# Optional. Uncomment to create a database of this name after the +# system is initialized. You can always create a database later using +# the CREATE DATABASE command or the createdb script. + +DATABASE_NAME=gpadmin + +# Mirror configuration + +MIRROR_PORT_BASE=50000 + +declare -a MIRROR_DATA_DIRECTORY=(/data0/database/mirror \ + /data0/database/mirror) + +# REPLICATION_PORT_BASE=41000 +# MIRROR_REPLICATION_PORT_BASE=51000 diff --git a/devops/sandbox/configs/gpinitsystem_singlenode b/devops/sandbox/configs/gpinitsystem_singlenode new file mode 100644 index 00000000000..f221d81938c --- /dev/null +++ b/devops/sandbox/configs/gpinitsystem_singlenode @@ -0,0 +1,140 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# FILE NAME: gpinitsystem_singlenode + +# A configuration file is needed by the gpinitsystem utility. +# This sample file initializes a Apache Cloudberry Single Node +# Edition (SNE) system with one coordinator and two segment instances +# on the local host. This file is referenced when you run gpinitsystem. + +################################################ +# REQUIRED PARAMETERS +################################################ + +# A name for the array you are configuring. You can use any name you +# like. Enclose the name in quotes if the name contains spaces. + +ARRAY_NAME="Sandbox: Apache Cloudberry Cluster" + +# This specifies the file that contains the list of segment host names +# that comprise the Greenplum system. For a single-node system, this +# file contains the local OS-configured hostname (as output by the +# hostname command). If the file does not reside in the same +# directory where the gpinitsystem utility is executed, specify +# the absolute path to the file. + +MACHINE_LIST_FILE=/tmp/gpdb-hosts + +# This specifies a prefix that will be used to name the data directories +# of the coordinator and segment instances. The naming convention for data +# directories in a Apache Cloudberry system is SEG_PREFIX +# where starts with 0 for segment instances and the coordinator +# is always -1. So for example, if you choose the prefix gpsne, your +# coordinator instance data directory would be named gpsne-1, and the segment +# instances would be named gpsne0, gpsne1, gpsne2, gpsne3, and so on. 
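+# With the gpseg prefix used below, the coordinator data directory is gpseg-1 and
+# the segment data directories are gpseg0, gpseg1, and so on.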
+ +SEG_PREFIX=gpseg + +# Base port number on which primary segment instances will be +# started on a segment host. The base port number will be +# incremented by one for each segment instance started on a host. + +PORT_BASE=40000 + +# This specifies the data storage location(s) where the script will +# create the primary segment data directories. The script creates a +# unique data directory for each segment instance. If you want multiple +# segment instances per host, list a data storage area for each primary +# segment you want created. The recommended number is one primary segment +# per CPU. It is OK to list the same data storage area multiple times +# if you want your data directories created in the same location. The +# number of data directory locations specified will determine the number +# of primary segment instances created per host. +# You must make sure that the user who runs gpinitsystem (for example, +# the gpadmin user) has permissions to write to these directories. You +# may want to create these directories on the segment hosts before running +# gpinitsystem and chown them to the appropriate user. + +declare -a DATA_DIRECTORY=(/data0/database/primary \ + /data0/database/primary \ + /data0/database/primary) + +# The OS-configured hostname of the Apache Cloudberry coordinator instance. + +COORDINATOR_HOSTNAME=cdw + +# The location where the data directory will be created on the +# Greenplum coordinator host. +# You must make sure that the user who runs gpinitsystem +# has permissions to write to this directory. You may want to +# create this directory on the coordinator host before running +# gpinitsystem and chown it to the appropriate user. + +COORDINATOR_DIRECTORY=/data0/database/coordinator + +# The port number for the coordinator instance. This is the port number +# that users and client connections will use when accessing the +# Apache Cloudberry system. + +COORDINATOR_PORT=5432 + +# The shell the gpinitsystem script uses to execute +# commands on remote hosts. Allowed value is ssh. You must set up +# your trusted host environment before running the gpinitsystem +# script. You can use gpssh-exkeys to do this. + +TRUSTED_SHELL=ssh + +# Maximum distance between automatic write ahead log (WAL) +# checkpoints, in log file segments (each segment is normally 16 +# megabytes). This will set the checkpoint_segments parameter +# in the postgresql.conf file for each segment instance in the +# Apache Cloudberry system. + +CHECK_POINT_SEGMENTS=8 + +# The character set encoding to use. Greenplum supports the +# same character sets as PostgreSQL. See 'Character Set Support' +# in the PostgreSQL documentation for allowed character sets. +# Should correspond to the OS locale specified with the +# gpinitsystem -n option. + +ENCODING=UNICODE + +################################################ +# OPTIONAL PARAMETERS +################################################ + +# Optional. Uncomment to create a database of this name after the +# system is initialized. You can always create a database later using +# the CREATE DATABASE command or the createdb script. 
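+# e.g. createdb mydb   (run as gpadmin once the cluster is up; 'mydb' is only an
+# example name)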
+ +DATABASE_NAME=gpadmin + +# Mirror configuration + +MIRROR_PORT_BASE=50000 + +declare -a MIRROR_DATA_DIRECTORY=(/data0/database/mirror \ + /data0/database/mirror \ + /data0/database/mirror) + +# REPLICATION_PORT_BASE=41000 +# MIRROR_REPLICATION_PORT_BASE=51000 diff --git a/devops/sandbox/configs/init_system.sh b/devops/sandbox/configs/init_system.sh new file mode 100755 index 00000000000..455341ff46c --- /dev/null +++ b/devops/sandbox/configs/init_system.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +## ====================================================================== +## Container initialization script for Apache Cloudberry Sandbox +## ====================================================================== + +# ---------------------------------------------------------------------- +# Start SSH daemon and setup for SSH access +# ---------------------------------------------------------------------- +# The SSH daemon is started to allow remote access to the container via +# SSH. This is useful for development and debugging purposes. +# ---------------------------------------------------------------------- + +# Ensure SSH directory exists (created at build time; ignore errors if any) +mkdir -p /run/sshd 2>/dev/null || true + +# Start SSH daemon directly (binary is setuid-root in the image) +if ! /usr/sbin/sshd; then + echo "Failed to start SSH daemon" >&2 + exit 1 +fi + +# Give SSH daemon time to start +sleep 5 + +# ---------------------------------------------------------------------- +# Remove /run/nologin to allow logins +# ---------------------------------------------------------------------- +# The /run/nologin file, if present, prevents users from logging into +# the system. This file is removed to ensure that users can log in via SSH. +# ---------------------------------------------------------------------- +rm -f /run/nologin 2>/dev/null || true + +# ---------------------------------------------------------------------- +# Configure passwordless SSH access for 'gpadmin' user +# ---------------------------------------------------------------------- +# SSH keys are already generated and configured in the Docker image at +# build time. All containers from the same image share the same keypair, +# which allows passwordless SSH between containers. +# +# This is ONLY suitable for sandbox/testing environments. +# DO NOT use this approach in production. +# ---------------------------------------------------------------------- + +# Verify SSH keys exist (they should be in the image already) +if [ ! 
-f /home/gpadmin/.ssh/id_rsa ]; then + echo "ERROR: SSH keys not found in image. This should not happen." + exit 1 +fi + +# Add container hostnames to the known_hosts file to avoid SSH warnings +if [[ "${MULTINODE:-false}" == "true" ]]; then + ssh-keyscan -t rsa cdw scdw sdw1 sdw2 > /home/gpadmin/.ssh/known_hosts 2>/dev/null || true +else + ssh-keyscan -t rsa cdw > /home/gpadmin/.ssh/known_hosts 2>/dev/null || true +fi +chmod 600 /home/gpadmin/.ssh/known_hosts +chown gpadmin:gpadmin /home/gpadmin/.ssh/known_hosts + +# Load Cloudberry/Greenplum environment with fallback, then ensure PATH +if [ -f "/usr/local/cloudberry-db/cloudberry-env.sh" ]; then + # shellcheck disable=SC1091 + . /usr/local/cloudberry-db/cloudberry-env.sh +elif [ -f "/usr/local/cloudberry-db/greenplum_path.sh" ]; then + # shellcheck disable=SC1091 + . /usr/local/cloudberry-db/greenplum_path.sh +else + # Fallback: minimal env to find gp* tools + export GPHOME="/usr/local/cloudberry-db" +fi +# Ensure coordinator data dir variable is set +export COORDINATOR_DATA_DIRECTORY="${COORDINATOR_DATA_DIRECTORY:-/data0/database/coordinator/gpseg-1}" +# Ensure PATH includes Cloudberry bin +if [ -d "/usr/local/cloudberry-db/bin" ]; then + case ":$PATH:" in + *":/usr/local/cloudberry-db/bin:"*) : ;; + *) export PATH="/usr/local/cloudberry-db/bin:$PATH" ;; + esac +fi + +# Initialize single node Cloudberry cluster +if [[ "${MULTINODE:-false}" == "false" && "$HOSTNAME" == "cdw" ]]; then + gpinitsystem -a \ + -c /tmp/gpinitsystem_singlenode \ + -h /tmp/gpdb-hosts \ + --max_connections=100 +# Initialize multi node Cloudberry cluster +elif [[ "${MULTINODE:-false}" == "true" && "$HOSTNAME" == "cdw" ]]; then + # Wait for other containers' SSH to become reachable (max 120s per host) + for host in sdw1 sdw2 scdw; do + MAX_WAIT=120 + WAITED=0 + until ssh -o StrictHostKeyChecking=no -o PasswordAuthentication=no -o ConnectTimeout=5 gpadmin@${host} "echo Connected to ${host}" 2>/dev/null; do + if [ $WAITED -ge $MAX_WAIT ]; then + echo "Timeout waiting for SSH on ${host}" + exit 1 + fi + sleep 5 + WAITED=$((WAITED+5)) + done + done + + # Clean up any existing data directories to avoid conflicts + rm -rf /data0/database/coordinator/* /data0/database/primary/* /data0/database/mirror/* 2>/dev/null || true + + # Ensure database directories exist with proper permissions + mkdir -p /data0/database/coordinator /data0/database/primary /data0/database/mirror + chmod -R 700 /data0/database + + gpinitsystem -a \ + -c /tmp/gpinitsystem_multinode \ + -h /tmp/multinode-gpinit-hosts \ + --max_connections=100 + gpinitstandby -s scdw -a + printf "sdw1\nsdw2\n" >> /tmp/gpdb-hosts +fi + +# ---------------------------------------------------------------------- +# Post-initialization configuration (applies to both single and multi-node) +# ---------------------------------------------------------------------- +# Configure pg_hba.conf to allow passwordless access from any host, +# remove password requirement for gpadmin user, and display cluster info. +# This section runs on the coordinator node after cluster initialization. 
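+# (gpstop -u only reloads configuration files such as pg_hba.conf; it does not
+# restart the cluster, so the new trust entry takes effect immediately.)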
+# ---------------------------------------------------------------------- +if [ "$HOSTNAME" == "cdw" ]; then + ## Allow any host access the Cloudberry Cluster + echo 'host all all 0.0.0.0/0 trust' >> /data0/database/coordinator/gpseg-1/pg_hba.conf + gpstop -u + + cat <<-'EOF' + +====================================================================== + ____ _ _ _ + / ___| | ___ _ _ __| | |__ ___ _ __ _ __ _ _ + | | | |/ _ \| | | |/ _` | '_ \ / _ \ '__| '__| | | | + | |___| | (_) | |_| | (_| | |_) | __/ | | | | |_| | + \____|_|\___/ \__,_|\__,_|_.__/ \___|_| |_| \__, | + |___/ +====================================================================== +EOF + + cat <<-'EOF' + +====================================================================== +Sandbox: Apache Cloudberry Cluster details +====================================================================== + +EOF + + echo "Current time: $(date)" + source /etc/os-release + echo "OS Version: ${NAME} ${VERSION}" + + ## Display version and cluster configuration + psql -P pager=off -d template1 -c "SELECT VERSION()" + psql -P pager=off -d template1 -c "SELECT * FROM gp_segment_configuration ORDER BY dbid" + psql -P pager=off -d template1 -c "SHOW optimizer" +fi + +echo """ +=========================== += DEPLOYMENT SUCCESSFUL = +=========================== +""" + +# ---------------------------------------------------------------------- +# Start an interactive bash shell +# ---------------------------------------------------------------------- +# Finally, the script starts an interactive bash shell to keep the +# container running and allow the user to interact with the environment. +# ---------------------------------------------------------------------- +/bin/bash diff --git a/devops/sandbox/configs/multinode-gpinit-hosts b/devops/sandbox/configs/multinode-gpinit-hosts new file mode 100644 index 00000000000..6da00621266 --- /dev/null +++ b/devops/sandbox/configs/multinode-gpinit-hosts @@ -0,0 +1,2 @@ +sdw1 +sdw2 diff --git a/devops/sandbox/docker-compose-rockylinux9.yml b/devops/sandbox/docker-compose-rockylinux9.yml new file mode 100644 index 00000000000..c37471898d1 --- /dev/null +++ b/devops/sandbox/docker-compose-rockylinux9.yml @@ -0,0 +1,81 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# -------------------------------------------------------------------- + +services: + cbdb-coordinator: + container_name: cbdb-cdw + image: cbdb-${CODEBASE_VERSION}:${OS_VERSION} + ports: + - "15432:5432" + hostname: cdw + tty: true + networks: + interconnect: + ipv4_address: 10.5.0.10 + environment: + MULTINODE: "true" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:ro + + cbdb-standby-coordinator: + container_name: cbdb-scdw + image: cbdb-${CODEBASE_VERSION}:${OS_VERSION} + hostname: scdw + tty: true + networks: + interconnect: + ipv4_address: 10.5.0.11 + environment: + MULTINODE: "true" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:ro + cbdb-segment-host-1: + container_name: cbdb-sdw1 + image: cbdb-${CODEBASE_VERSION}:${OS_VERSION} + hostname: sdw1 + tty: true + networks: + interconnect: + ipv4_address: 10.5.0.12 + environment: + MULTINODE: "true" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:ro + cbdb-segment-host-2: + container_name: cbdb-sdw2 + image: cbdb-${CODEBASE_VERSION}:${OS_VERSION} + hostname: sdw2 + tty: true + networks: + interconnect: + ipv4_address: 10.5.0.13 + environment: + MULTINODE: "true" + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:ro + +networks: + interconnect: + name: cbdb-interconnect + driver: bridge + ipam: + config: + - subnet: 10.5.0.0/16 + gateway: 10.5.0.1 diff --git a/devops/sandbox/run.sh b/devops/sandbox/run.sh new file mode 100755 index 00000000000..7c266b8f64c --- /dev/null +++ b/devops/sandbox/run.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# -------------------------------------------------------------------- +set -euo pipefail + +# Default values +DEFAULT_OS_VERSION="rockylinux9" +DEFAULT_TIMEZONE_VAR="America/Los_Angeles" +DEFAULT_PIP_INDEX_URL_VAR="https://pypi.org/simple" +BUILD_ONLY="false" +MULTINODE="false" + +# Use environment variables if set, otherwise use default values +# Export set for some variables to be used referenced docker compose file +export OS_VERSION="${OS_VERSION:-$DEFAULT_OS_VERSION}" +BUILD_ONLY="${BUILD_ONLY:-false}" +export CODEBASE_VERSION="${CODEBASE_VERSION:-}" +TIMEZONE_VAR="${TIMEZONE_VAR:-$DEFAULT_TIMEZONE_VAR}" +PIP_INDEX_URL_VAR="${PIP_INDEX_URL_VAR:-$DEFAULT_PIP_INDEX_URL_VAR}" + +# Function to display help message +function usage() { + echo "Usage: $0 [-o ] [-c ] [-b] [-m]" + echo " -c Codebase version (valid values: main, local, or other available version like 2.0.0)" + echo " -t Timezone (default: America/Los_Angeles, or set via TIMEZONE_VAR environment variable)" + echo " -p Python Package Index (PyPI) (default: https://pypi.org/simple, or set via PIP_INDEX_URL_VAR environment variable)" + echo " -b Build only, do not run the container (default: false, or set via BUILD_ONLY environment variable)" + echo " -m Multinode, this creates a multinode (multi-container) Cloudberry cluster using docker compose (requires compose to be installed)" + exit 1 +} + +# Parse command-line options +while getopts "c:t:p:bmh" opt; do + case "${opt}" in + c) + CODEBASE_VERSION=${OPTARG} + ;; + t) + TIMEZONE_VAR=${OPTARG} + ;; + p) + PIP_INDEX_URL_VAR=${OPTARG} + ;; + b) + BUILD_ONLY="true" + ;; + m) + MULTINODE="true" + ;; + h) + usage + ;; + *) + usage + ;; + esac +done + +if [[ $MULTINODE == "true" ]] && ! docker compose version; then + echo "Error: Multinode -m flag found in run arguments but calling docker compose failed. Please install Docker Compose by following the instructions at https://docs.docker.com/compose/install/. Exiting" + exit 1 +fi + +if [[ "${MULTINODE}" == "true" && "${BUILD_ONLY}" == "true" ]]; then + echo "Error: Cannot pass both multinode deployment [m] and build only [b] flags together" + exit 1 +fi + +# CODEBASE_VERSION must be specified via -c argument or CODEBASE_VERSION environment variable +if [[ -z "$CODEBASE_VERSION" ]]; then + echo "Error: CODEBASE_VERSION must be specified via environment variable or '-c' command line parameter." + usage +fi + +# Validate OS_VERSION and map to appropriate Docker image +case "${OS_VERSION}" in + rockylinux9) + OS_DOCKER_IMAGE="rockylinux9" + ;; + *) + echo "Invalid OS version: ${OS_VERSION}" + usage + ;; +esac + +# Validate CODEBASE_VERSION +if [[ "${CODEBASE_VERSION}" != "main" && "${CODEBASE_VERSION}" != "local" && ! "${CODEBASE_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "Invalid codebase version: ${CODEBASE_VERSION}" + usage +fi + +# Determine sandbox directory and repository root +SANDBOX_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SANDBOX_DIR}/../.." && pwd)" + +# Ensure submodules are initialized for local builds +if [[ "${CODEBASE_VERSION}" = "local" ]]; then + if [[ -d "${REPO_ROOT}/.git" ]] && command -v git >/dev/null 2>&1; then + # Check if any submodules are uninitialized (-), out of sync (+), or have conflicts (U) + if (cd "${REPO_ROOT}" && git submodule status --recursive | grep -qE "^[+-U]"); then + echo "Updating git submodules for local build..." + (cd "${REPO_ROOT}" && git submodule update --init --recursive) + else + echo "Git submodules are already up to date. Skipping update." 
+ fi + else + echo "Warning: Skipping 'git submodule update --init --recursive' for local build because either '.git' directory or 'git' command is missing in ${REPO_ROOT}." + echo "If your Cloudberry checkout relies on git submodules, please ensure they have been populated before running with '-c local'." + fi +fi + + +# Build image +if [[ "${CODEBASE_VERSION}" = "main" || "${CODEBASE_VERSION}" = "local" ]]; then + DOCKERFILE="${SANDBOX_DIR}/Dockerfile.main.${OS_VERSION}" + + # Single image build from main or local source + docker build --file ${DOCKERFILE} \ + --build-arg TIMEZONE_VAR="${TIMEZONE_VAR}" \ + --build-arg CODEBASE_VERSION="${CODEBASE_VERSION}" \ + --tag cbdb-${CODEBASE_VERSION}:${OS_VERSION} \ + ${REPO_ROOT} +else + DOCKERFILE="${SANDBOX_DIR}/Dockerfile.RELEASE.${OS_VERSION}" + + docker build --file ${DOCKERFILE} \ + --build-arg TIMEZONE_VAR="${TIMEZONE_VAR}" \ + --build-arg CODEBASE_VERSION_VAR="${CODEBASE_VERSION}" \ + --tag cbdb-${CODEBASE_VERSION}:${OS_VERSION} \ + ${SANDBOX_DIR} +fi + +# Check if build only flag is set +if [ "${BUILD_ONLY}" == "true" ]; then + echo "Docker image built successfully with OS version ${OS_VERSION} and codebase version ${CODEBASE_VERSION}. Build only mode, not running the container." + exit 0 +fi + +# Deploy container(s) +if [ "${MULTINODE}" == "true" ]; then + docker compose -f docker-compose-$OS_VERSION.yml up --detach +else + docker run --interactive \ + --tty \ + --name cbdb-cdw \ + --detach \ + --volume /sys/fs/cgroup:/sys/fs/cgroup:ro \ + --publish 122:22 \ + --publish 15432:5432 \ + --hostname cdw \ + cbdb-${CODEBASE_VERSION}:${OS_VERSION} +fi diff --git a/devops/sandbox/sandbox-deployment.jpg b/devops/sandbox/sandbox-deployment.jpg new file mode 100644 index 00000000000..bb1b2dc0741 Binary files /dev/null and b/devops/sandbox/sandbox-deployment.jpg differ diff --git a/devops/tools/elf_rockylinux_dependency_analyzer.py b/devops/tools/elf_rockylinux_dependency_analyzer.py new file mode 100755 index 00000000000..593dd169aa6 --- /dev/null +++ b/devops/tools/elf_rockylinux_dependency_analyzer.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +ELF Dependency Analyzer + +This script analyzes ELF (Executable and Linkable Format) binaries to determine their runtime +package dependencies. It can process individual files or recursively analyze directories. + +The script provides information about: +- Required packages and their versions +- Missing libraries +- Custom or non-RPM libraries +- Other special cases + +It also groups packages by their high-level dependencies, which can be cached for performance. + +Usage: + python3 elf_dependency_analyzer.py [--rebuild-cache] [ ...] 
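The sandbox deployment script that closes just above is the how-to piece of this hunk, so a brief hedged example of typical invocations may help reviewers. The script's own filename is not visible in this excerpt; `build-and-run.sh` under `devops/sandbox/` is only an assumed name based on the surrounding paths, while the flags and values (`-c`, `-b`, `-m`, `main`, `local`, `2.0.0`) come from the script itself.

```bash
# Assumed filename; substitute the actual script added under devops/sandbox/.
cd devops/sandbox

# Build and run a single-node sandbox container from the local checkout
./build-and-run.sh -c local

# Build only (no container is started) from a released codebase version
./build-and-run.sh -c 2.0.0 -b

# Bring up a multinode (multi-container) cluster via docker compose
./build-and-run.sh -c main -m
```

In the single-node path, SSH is published on host port 122 and PostgreSQL on 15432, matching the `docker run --publish` flags shown above.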
+ +The script will automatically determine if each argument is a file or directory and process accordingly. +Use --rebuild-cache to force rebuilding of the high-level packages cache. + +Requirements: +- Python 3.6+ +- prettytable (pip install prettytable) +- python-dateutil (pip install python-dateutil) +- ldd (usually pre-installed on Linux systems) +- file (usually pre-installed on Linux systems) +- rpm (usually pre-installed on RPM-based Linux distributions) +- repoquery (part of yum-utils package) + +Functions: +- check_requirements(): Checks if all required commands are available. +- run_command(command): Executes a shell command and returns its output. +- parse_ldd_line(line): Parses a line of ldd output to extract the library name. +- find_library_in_ld_library_path(lib_name): Searches for a library in LD_LIBRARY_PATH. +- get_package_info(lib_path): Gets package information for a given library. +- get_package_dependencies(package): Gets dependencies of a package using repoquery. +- build_high_level_packages(grand_summary): Builds a mapping of high-level packages to their dependencies. +- load_or_build_high_level_packages(grand_summary, force_rebuild): Loads or builds the high-level packages cache. +- print_summary(packages, special_cases, missing_libraries, binary_path): Prints a summary for a single binary. +- process_binary(binary_path): Processes a single binary file. +- is_elf_binary(file_path): Checks if a file is an ELF binary. +- print_grand_summary(...): Prints a grand summary of all processed binaries. +- analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries): Analyzes a file or directory. +- main(): Main function to handle command-line arguments and initiate the analysis. + +This script is designed to help system administrators and developers understand the dependencies +of ELF binaries in their systems, which can be useful for troubleshooting, optimizing, or +preparing deployment packages. 
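The usage synopsis in this docstring appears to have lost its path placeholders in formatting, so here is a sketch of how the analyzer would typically be driven. The target paths are purely illustrative; the `--rebuild-cache` flag and the seven-day cache expiry are taken from the script itself.

```bash
# Analyze a single ELF binary (illustrative path)
python3 devops/tools/elf_rockylinux_dependency_analyzer.py /usr/local/cloudberry-db/bin/postgres

# Recursively analyze a directory tree and force a rebuild of the
# high-level packages cache (otherwise reused for up to 7 days)
python3 devops/tools/elf_rockylinux_dependency_analyzer.py --rebuild-cache /usr/local/cloudberry-db/lib
```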
+""" + +import os, subprocess, re, sys, json, shutil +from collections import defaultdict +from datetime import datetime, timedelta +import argparse +from prettytable import PrettyTable +from dateutil import parser + +CACHE_FILE = 'high_level_packages_cache.json' +CACHE_EXPIRY_DAYS = 7 + +def check_requirements(): + required_commands = ['ldd', 'file', 'rpm', 'repoquery'] + missing_commands = [cmd for cmd in required_commands if shutil.which(cmd) is None] + if missing_commands: + print("Error: The following required commands are missing:") + for cmd in missing_commands: + print(f" - {cmd}") + print("\nPlease install these commands and try again.") + if 'repoquery' in missing_commands: + print("Note: 'repoquery' is typically part of the 'yum-utils' package.") + sys.exit(1) + +def run_command(command): + try: + return subprocess.check_output(command, stderr=subprocess.STDOUT).decode('utf-8') + except subprocess.CalledProcessError as e: + print(f"Error running command {' '.join(command)}: {e.output.decode('utf-8').strip()}") + return None + +def parse_ldd_line(line): + match = re.search(r'\s*(\S+) => (\S+) \((0x[0-9a-f]+)\)', line) + return match.group(1) if match else None + +def find_library_in_ld_library_path(lib_name): + ld_library_path = os.environ.get('LD_LIBRARY_PATH', '') + for directory in ld_library_path.split(':'): + potential_path = os.path.join(directory, lib_name) + if os.path.isfile(potential_path): + return potential_path + return None + +def get_package_info(lib_path): + if not os.path.isfile(lib_path): + lib_name = os.path.basename(lib_path) + lib_path = find_library_in_ld_library_path(lib_name) + if not lib_path: + return None + try: + full_package_name = run_command(['rpm', '-qf', lib_path]) + if full_package_name: + package_name = full_package_name.split('-')[0] + return package_name, full_package_name.strip() + except subprocess.CalledProcessError: + pass + return None + +def get_package_dependencies(package): + try: + output = subprocess.check_output(['repoquery', '--requires', '--resolve', package], + universal_newlines=True, stderr=subprocess.DEVNULL) + return set(output.strip().split('\n')) + except subprocess.CalledProcessError: + return set() + +def build_high_level_packages(grand_summary): + all_packages = set() + for packages in grand_summary.values(): + all_packages.update(package.split('-')[0] for package in packages) + high_level_packages = {} + for package in all_packages: + deps = get_package_dependencies(package) + if deps: + high_level_packages[package] = [dep.split('-')[0] for dep in deps] + return high_level_packages + +def load_or_build_high_level_packages(grand_summary, force_rebuild=False): + if not force_rebuild and os.path.exists(CACHE_FILE): + with open(CACHE_FILE, 'r') as f: + cache_data = json.load(f) + if datetime.now() - parser.parse(cache_data['timestamp']) < timedelta(days=CACHE_EXPIRY_DAYS): + return cache_data['packages'] + packages = build_high_level_packages(grand_summary) + with open(CACHE_FILE, 'w') as f: + json.dump({'timestamp': datetime.now().isoformat(), 'packages': packages}, f) + return packages + +def print_summary(packages, special_cases, missing_libraries, binary_path): + print("\nSummary of unique runtime packages required:") + table = PrettyTable(['Package Name', 'Full Package Name']) + table.align['Package Name'] = 'l' + table.align['Full Package Name'] = 'l' + unique_packages = sorted(set(packages)) + for package_name, full_package_name in unique_packages: + table.add_row([package_name, full_package_name]) + print(table) + 
if missing_libraries: + print("\nMISSING LIBRARIES:") + missing_table = PrettyTable(['Missing Library', 'Referenced By']) + missing_table.align['Missing Library'] = 'l' + missing_table.align['Referenced By'] = 'l' + for lib in missing_libraries: + missing_table.add_row([lib, binary_path]) + print(missing_table) + if special_cases: + print("\nSPECIAL CASES:") + special_table = PrettyTable(['Library/Case', 'Referenced By', 'Category']) + special_table.align['Library/Case'] = 'l' + special_table.align['Referenced By'] = 'l' + special_table.align['Category'] = 'l' + for case in special_cases: + category = "Custom/Non-RPM" if "custom or non-RPM library" in case else "Other" + library = case.split(" is ")[0] if " is " in case else case + special_table.add_row([library, binary_path, category]) + print(special_table) + else: + print("\nSPECIAL CASES: None found") + +def process_binary(binary_path): + print(f"Binary: {binary_path}\n") + print("Libraries and their corresponding packages:") + packages, special_cases, missing_libraries = [], [], [] + known_special_cases = ['linux-vdso.so.1', 'ld-linux-x86-64.so.2'] + ldd_output = run_command(['ldd', binary_path]) + if ldd_output is None: + return packages, special_cases, missing_libraries + for line in ldd_output.splitlines(): + if any(special in line for special in known_special_cases): + continue + parts = line.split('=>') + lib_name = parts[0].strip() + if "not found" in line: + missing_libraries.append(lib_name) + print(f"MISSING: {line.strip()}") + else: + if len(parts) > 1: + lib_path = parts[1].split()[0] + if lib_path != "not": + package_info = get_package_info(lib_path) + if package_info: + print(f"{lib_path} => {package_info[1]}") + packages.append(package_info) + else: + if os.path.exists(lib_path): + special_case = f"{lib_path} is a custom or non-RPM library" + special_cases.append(special_case) + print(f"{lib_path} => Custom or non-RPM library") + else: + special_case = f"{lib_path} is not found and might be a special case" + special_cases.append(special_case) + print(f"{lib_path} => Not found, might be a special case") + else: + special_case = f"{line.strip()} is a special case or built-in library" + special_cases.append(special_case) + print(f"{line.strip()} => Special case or built-in library") + else: + special_case = f"{line.strip()} is a special case or built-in library" + special_cases.append(special_case) + print(f"{line.strip()} => Special case or built-in library") + if special_cases: + print(f"Special cases found for {binary_path}:") + for case in special_cases: + print(f" - {case}") + else: + print(f"No special cases found for {binary_path}") + print_summary(packages, special_cases, missing_libraries, binary_path) + print("-------------------------------------------") + return packages, special_cases, missing_libraries + +def is_elf_binary(file_path): + file_output = run_command(['file', file_path]) + return 'ELF' in file_output and ('executable' in file_output or 'shared object' in file_output) + +def print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries, HIGH_LEVEL_PACKAGES, PACKAGE_TO_HIGH_LEVEL): + if grand_summary or grand_special_cases or grand_missing_libraries: + print("\nGrand Summary of high-level runtime packages required across all binaries:") + high_level_summary = defaultdict(set) + for package_name, full_package_names in grand_summary.items(): + high_level_package = PACKAGE_TO_HIGH_LEVEL.get(package_name.split('-')[0], package_name.split('-')[0]) + 
high_level_summary[high_level_package].update(full_package_names) + table = PrettyTable(['High-Level Package', 'Included Packages']) + table.align['High-Level Package'] = 'l' + table.align['Included Packages'] = 'l' + for high_level_package, full_package_names in sorted(high_level_summary.items()): + included_packages = '\n'.join(sorted(full_package_names)) + table.add_row([high_level_package, included_packages]) + print(table) + if grand_missing_libraries: + print("\nGrand Summary of MISSING LIBRARIES across all binaries:") + missing_table = PrettyTable(['Missing Library', 'Referenced By']) + missing_table.align['Missing Library'] = 'l' + missing_table.align['Referenced By'] = 'l' + for lib, binaries in sorted(grand_missing_libraries.items()): + missing_table.add_row([lib, '\n'.join(sorted(binaries))]) + print(missing_table) + print("\nGrand Summary of special cases across all binaries:") + if grand_special_cases: + special_table = PrettyTable(['Library/Case', 'Referenced By', 'Category']) + special_table.align['Library/Case'] = 'l' + special_table.align['Referenced By'] = 'l' + special_table.align['Category'] = 'l' + for case, binary in sorted(set(grand_special_cases)): + category = "Custom/Non-RPM" if "custom or non-RPM library" in case else "Other" + library = case.split(" is ")[0] if " is " in case else case + special_table.add_row([library, binary, category]) + print(special_table) + else: + print("No special cases found.") + +def analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries): + if os.path.isfile(path): + packages, special_cases, missing_libraries = process_binary(path) + for package_name, full_package_name in packages: + grand_summary[package_name].add(full_package_name) + grand_special_cases.extend((case, path) for case in special_cases) + for lib in missing_libraries: + grand_missing_libraries[lib].add(path) + elif os.path.isdir(path): + for root, dirs, files in os.walk(path): + for file in files: + file_path = os.path.join(root, file) + if is_elf_binary(file_path): + packages, special_cases, missing_libraries = process_binary(file_path) + for package_name, full_package_name in packages: + grand_summary[package_name].add(full_package_name) + grand_special_cases.extend((case, file_path) for case in special_cases) + for lib in missing_libraries: + grand_missing_libraries[lib].add(file_path) + else: + print(f"Error: {path} is neither a valid file nor a directory.") + if grand_special_cases: + print(f"Accumulated special cases after processing {path}:") + for case, binary in grand_special_cases: + print(f" - {case} (in {binary})") + else: + print(f"No special cases accumulated after processing {path}") + +def main(): + check_requirements() + parser = argparse.ArgumentParser(description="ELF Dependency Analyzer") + parser.add_argument('paths', nargs='+', help="Paths to files or directories to analyze") + parser.add_argument('--rebuild-cache', action='store_true', help="Force rebuild of the high-level packages cache") + args = parser.parse_args() + grand_summary = defaultdict(set) + grand_special_cases = [] + grand_missing_libraries = defaultdict(set) + for path in args.paths: + analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries) + HIGH_LEVEL_PACKAGES = load_or_build_high_level_packages(grand_summary, args.rebuild_cache) + PACKAGE_TO_HIGH_LEVEL = {low: high for high, lows in HIGH_LEVEL_PACKAGES.items() for low in lows} + print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries, HIGH_LEVEL_PACKAGES, 
PACKAGE_TO_HIGH_LEVEL) + +if __name__ == '__main__': + main() diff --git a/devops/tools/elf_ubuntu_dependency_analyzer.py b/devops/tools/elf_ubuntu_dependency_analyzer.py new file mode 100755 index 00000000000..a1741f7f888 --- /dev/null +++ b/devops/tools/elf_ubuntu_dependency_analyzer.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +ELF Dependency Analyzer for Ubuntu + +This script analyzes ELF (Executable and Linkable Format) binaries to determine their runtime +package dependencies on Ubuntu systems. It can process individual files or recursively analyze directories. + +The script provides information about: +- Required packages and their versions +- Custom or non-APT libraries +- Core system libraries +- Missing libraries +- Other special cases + +Usage: + python3 elf_dependency_analyzer.py [file_or_directory] [file_or_directory] ... + +Requirements: +- Python 3.6+ +- prettytable (pip install prettytable) +- ldd (usually pre-installed on Linux systems) +- file (usually pre-installed on Linux systems) +- dpkg (pre-installed on Ubuntu) +""" + +import os +import subprocess +import re +import sys +import argparse +from collections import defaultdict +from prettytable import PrettyTable + +def run_command(command): + """ + Execute a shell command and return its output. + + Args: + command (list): The command to execute as a list of strings. + + Returns: + str: The output of the command, or None if an error occurred. + """ + try: + return subprocess.check_output(command, stderr=subprocess.STDOUT).decode('utf-8') + except subprocess.CalledProcessError as e: + print(f"Error running command {' '.join(command)}: {e.output.decode('utf-8').strip()}") + return None + +def get_package_info(lib_path): + """ + Get package information for a given library path. + + Args: + lib_path (str): The path to the library. + + Returns: + tuple: A tuple containing the package name and full package information. 
+ """ + if lib_path.startswith('/usr/local/cloudberry-db'): + return "cloudberry-custom", f"Cloudberry custom library: {lib_path}" + + dpkg_output = run_command(['dpkg', '-S', lib_path]) + if dpkg_output: + package_name = dpkg_output.split(':')[0] + return package_name, dpkg_output.strip() + + # List of core system libraries that might not be individually tracked by dpkg + core_libs = { + 'libc.so': 'libc6', + 'libm.so': 'libc6', + 'libdl.so': 'libc6', + 'libpthread.so': 'libc6', + 'libresolv.so': 'libc6', + 'librt.so': 'libc6', + 'libgcc_s.so': 'libgcc-s1', + 'libstdc++.so': 'libstdc++6', + 'libz.so': 'zlib1g', + 'libbz2.so': 'libbz2-1.0', + 'libpam.so': 'libpam0g', + 'libaudit.so': 'libaudit1', + 'libcap-ng.so': 'libcap-ng0', + 'libkeyutils.so': 'libkeyutils1', + 'liblzma.so': 'liblzma5', + 'libcom_err.so': 'libcomerr2' + } + + lib_name = os.path.basename(lib_path) + for core_lib, package in core_libs.items(): + if lib_name.startswith(core_lib): + return package, f"Core system library: {lib_path}" + + # If not a recognized core library, return as system library + file_output = run_command(['file', lib_path]) + if file_output: + return "system-library", f"System library: {lib_path} - {file_output.strip()}" + + return None + +def print_summary(packages, special_cases, missing_libraries, binary_path): + """ + Print a summary of the dependencies for a binary. + + Args: + packages (list): List of package tuples (package_name, full_package_name). + special_cases (list): List of special case strings. + missing_libraries (list): List of missing library names. + binary_path (str): Path to the binary being analyzed. + """ + print("\nSummary of runtime dependencies:") + table = PrettyTable(['Category', 'Package/Library', 'Details']) + table.align['Category'] = 'l' + table.align['Package/Library'] = 'l' + table.align['Details'] = 'l' + + categories = { + 'cloudberry-custom': 'Cloudberry Custom', + 'system-library': 'System Library', + } + + for package_name, full_package_name in sorted(set(packages)): + category = categories.get(package_name, 'System Package') + table.add_row([category, package_name, full_package_name]) + + print(table) + + if missing_libraries: + print("\nMISSING LIBRARIES:") + for lib in missing_libraries: + print(f" - {lib}") + + if special_cases: + print("\nSPECIAL CASES:") + for case in special_cases: + print(f" - {case}") + +def process_binary(binary_path): + """ + Process a single binary file to determine its dependencies. + + Args: + binary_path (str): Path to the binary file. + + Returns: + tuple: A tuple containing lists of packages, special cases, and missing libraries. 
+ """ + print(f"Binary: {binary_path}\n") + print("Libraries and their corresponding packages:") + packages, special_cases, missing_libraries = [], [], [] + + ldd_output = run_command(['ldd', binary_path]) + if ldd_output is None: + return packages, special_cases, missing_libraries + + for line in ldd_output.splitlines(): + if "=>" not in line: + continue + + parts = line.split('=>') + lib_name = parts[0].strip() + lib_path = parts[1].split()[0].strip() + lib_path = os.path.realpath(lib_path) + + if lib_path == "not": + missing_libraries.append(lib_name) + print(f"MISSING: {line.strip()}") + else: + package_info = get_package_info(lib_path) + if package_info: + print(f"{lib_path} => {package_info[1]}") + packages.append(package_info) + else: + special_case = f"{lib_path} is not found and might be a special case" + special_cases.append(special_case) + print(f"{lib_path} => Not found, might be a special case") + + print_summary(packages, special_cases, missing_libraries, binary_path) + print("-------------------------------------------") + return packages, special_cases, missing_libraries + +def is_elf_binary(file_path): + """ + Check if a file is an ELF binary. + + Args: + file_path (str): Path to the file. + + Returns: + bool: True if the file is an ELF binary, False otherwise. + """ + file_output = run_command(['file', file_path]) + return 'ELF' in file_output and ('executable' in file_output or 'shared object' in file_output) + +def print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries): + """ + Print a grand summary of all analyzed binaries. + + Args: + grand_summary (dict): Dictionary of all packages and their details. + grand_special_cases (list): List of all special cases. + grand_missing_libraries (dict): Dictionary of all missing libraries. + """ + if grand_summary or grand_special_cases or grand_missing_libraries: + print("\nGrand Summary of runtime packages required across all binaries:") + table = PrettyTable(['Package', 'Included Packages']) + table.align['Package'] = 'l' + table.align['Included Packages'] = 'l' + for package_name, full_package_names in sorted(grand_summary.items()): + included_packages = '\n'.join(sorted(full_package_names)) + table.add_row([package_name, included_packages]) + print(table) + + if grand_missing_libraries: + print("\nGrand Summary of MISSING LIBRARIES across all binaries:") + missing_table = PrettyTable(['Missing Library', 'Referenced By']) + missing_table.align['Missing Library'] = 'l' + missing_table.align['Referenced By'] = 'l' + for lib, binaries in sorted(grand_missing_libraries.items()): + missing_table.add_row([lib, '\n'.join(sorted(binaries))]) + print(missing_table) + + print("\nGrand Summary of special cases across all binaries:") + if grand_special_cases: + special_table = PrettyTable(['Library/Case', 'Referenced By', 'Category']) + special_table.align['Library/Case'] = 'l' + special_table.align['Referenced By'] = 'l' + special_table.align['Category'] = 'l' + for case, binary in sorted(set(grand_special_cases)): + category = "System Library" if "system library" in case else "Other" + library = case.split(" is ")[0] if " is " in case else case + special_table.add_row([library, binary, category]) + print(special_table) + else: + print("No special cases found.") + +def analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries): + """ + Analyze a file or directory for ELF binaries and their dependencies. + + Args: + path (str): Path to the file or directory to analyze. 
+ grand_summary (dict): Dictionary to store all package information. + grand_special_cases (list): List to store all special cases. + grand_missing_libraries (dict): Dictionary to store all missing libraries. + """ + if os.path.isfile(path): + if is_elf_binary(path): + packages, special_cases, missing_libraries = process_binary(path) + for package_name, full_package_name in packages: + grand_summary[package_name].add(full_package_name) + grand_special_cases.extend((case, path) for case in special_cases) + for lib in missing_libraries: + grand_missing_libraries[lib].add(path) + elif os.path.isdir(path): + for root, dirs, files in os.walk(path): + for file in files: + file_path = os.path.join(root, file) + if is_elf_binary(file_path): + packages, special_cases, missing_libraries = process_binary(file_path) + for package_name, full_package_name in packages: + grand_summary[package_name].add(full_package_name) + grand_special_cases.extend((case, file_path) for case in special_cases) + for lib in missing_libraries: + grand_missing_libraries[lib].add(file_path) + else: + print(f"Error: {path} is neither a valid file nor a directory.") + +def main(): + """ + Main function to handle command-line arguments and initiate the analysis. + """ + parser = argparse.ArgumentParser(description="ELF Dependency Analyzer for Ubuntu") + parser.add_argument('paths', nargs='+', help="Paths to files or directories to analyze") + args = parser.parse_args() + + grand_summary = defaultdict(set) + grand_special_cases = [] + grand_missing_libraries = defaultdict(set) + + for path in args.paths: + analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries) + + print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries) + +if __name__ == '__main__': + main() diff --git a/devops/tools/s3-repo-sync-and-sign.sh b/devops/tools/s3-repo-sync-and-sign.sh new file mode 100755 index 00000000000..1cd037749c6 --- /dev/null +++ b/devops/tools/s3-repo-sync-and-sign.sh @@ -0,0 +1,266 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + +# Description: +# This script automates several tasks related to managing RPM repositories in AWS S3. +# It handles the following operations: +# 1. Syncing an RPM repository from an S3 bucket to a local directory. +# 2. Signing all RPMs in the local repository with a specified GPG key. +# 3. Updating and signing the repository metadata. +# 4. Exporting the GPG public key and placing it in the repository for client use. +# 5. Optionally, uploading changes back to the S3 bucket and deleting files in S3 that no longer exist locally. +# 6. Decrypting and importing a GPG private key used for signing. +# 7. A mode to only decrypt and import the GPG private key. +# 8. 
Identifying and copying a newly built RPM to the appropriate repository. + +# Function to display detailed usage information +usage() { + cat << EOF +Usage: $0 [OPTIONS] + +This script automates several tasks related to managing RPM repositories in AWS S3. +It can be used to sync repositories from S3, sign RPMs with a GPG key, update and sign repository metadata, +and optionally upload changes back to S3. + +Options: + -c Configure AWS credentials using 'aws configure'. + -s Specify the S3 bucket and path to sync (required for S3 operations). + -d Specify the local directory to sync to (default: ~/repo). + -k Specify the encrypted GPG private key file to import (optional). + -g Specify the GPG key ID or email to use for signing (required for signing operations). + --upload-with-delete Sync local changes to S3, deleting files in S3 that no longer exist locally. + --s3-sync-only Perform only the S3 sync to the local directory, inform the user, and exit. + --import-gpg-key-only Decrypt and import the GPG private key, then exit. No other operations will be performed. + --copy-new-rpm Copy the newly built RPM(s) to the appropriate repository directory based on architecture and version. + -h, --help Display this help message and exit. + +Examples: + # Sync an S3 repository to a local directory and sign RPMs with a GPG key + $0 -s s3://mybucket/repo -g mygpgkey@example.com + + # Sync an S3 repository only, without signing RPMs or performing other operations + $0 -s s3://mybucket/repo --s3-sync-only + + # Decrypt and import a GPG private key, then exit + $0 -k ~/path/to/encrypted-gpg-key.asc --import-gpg-key-only + + # Copy newly built RPMs to the appropriate repository and sign them + $0 --copy-new-rpm -g mygpgkey@example.com + +Notes: + - The -s option is required for any operation that interacts with S3, such as syncing or uploading with delete. + - The -g option is required for any operation that involves signing RPMs or repository metadata. + - When using --upload-with-delete, ensure that you have the necessary permissions to delete objects in the specified S3 bucket. + - If you only want to perform local operations (e.g., copying RPMs, signing), you do not need to specify the -s option. + +EOF +} + +# Parse options and arguments +GPG_KEY_ID="" +UPLOAD_WITH_DELETE=false +S3_SYNC_ONLY=false +IMPORT_GPG_KEY_ONLY=false +COPY_NEW_RPM=false +CONFIGURE_AWS=false +LOCAL_DIR=~/repo + +# Function to check if required commands are available +check_commands() { + local cmds=("aws" "gpg" "shred" "createrepo" "rpm" "find") + for cmd in "${cmds[@]}"; do + if ! command -v "$cmd" &> /dev/null; then + echo "Error: Required command '$cmd' not found. Please install it before running the script." + exit 1 + fi + done +} + +# Parse options +while [[ "$#" -gt 0 ]]; do + case $1 in + -c) CONFIGURE_AWS=true; shift ;; + -s) S3_BUCKET="$2"; shift 2 ;; + -d) LOCAL_DIR="$2"; shift 2 ;; + -k) ENCRYPTED_KEY_FILE="$2"; shift 2 ;; + -g) GPG_KEY_ID="$2"; shift 2 ;; + --upload-with-delete) UPLOAD_WITH_DELETE=true; shift ;; + --s3-sync-only) S3_SYNC_ONLY=true; shift ;; + --import-gpg-key-only) IMPORT_GPG_KEY_ONLY=true; shift ;; + --copy-new-rpm) COPY_NEW_RPM=true; shift ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1"; usage; exit 1 ;; + esac +done + +check_commands + +# AWS credentials configuration (optional) +if [ "$CONFIGURE_AWS" = true ]; then + echo "Configuring AWS credentials..." 
+ aws configure +fi + +# Decrypt and import GPG private key if in import-only mode or not in sync-only mode +if [ -n "${ENCRYPTED_KEY_FILE:-}" ]; then + DECRYPTED_KEY_FILE="${ENCRYPTED_KEY_FILE%.*}" + echo "Decrypting GPG private key..." + gpg --decrypt --output "$DECRYPTED_KEY_FILE" "$ENCRYPTED_KEY_FILE" + + # Check if the key is already imported + if gpg --list-keys | grep -q "$GPG_KEY_ID"; then + echo "GPG key already imported." + else + gpg --import "$DECRYPTED_KEY_FILE" + fi + + # Securely delete the decrypted key file + shred -u "$DECRYPTED_KEY_FILE" + + # Exit if only importing GPG key + if [ "$IMPORT_GPG_KEY_ONLY" = true ]; then + echo "GPG key has been decrypted and imported successfully. Exiting." + exit 0 + fi +fi + +# Check access to the S3 bucket and perform sync only if needed +if [ "$IMPORT_GPG_KEY_ONLY" = false ] && [ "$S3_SYNC_ONLY" = false ] && [ "$COPY_NEW_RPM" = false ] && [ "$UPLOAD_WITH_DELETE" = false ]; then + if [ -z "${S3_BUCKET:-}" ]; then + echo "Error: S3 bucket (-s) is required." + exit 1 + fi + + echo "Checking access to S3 bucket $S3_BUCKET..." + if ! aws s3 ls "$S3_BUCKET" &> /dev/null; then + echo "Error: Unable to access S3 bucket $S3_BUCKET. Please check your AWS credentials and permissions." + exit 1 + fi + + # Sync the S3 repository to the local directory + mkdir -p "$LOCAL_DIR" + echo "Syncing S3 repository from $S3_BUCKET to $LOCAL_DIR..." + aws s3 sync "$S3_BUCKET" "$LOCAL_DIR" + + # Check if the operation is `s3-sync-only` + if [ "$S3_SYNC_ONLY" = true ]; then + echo "S3 sync operation completed successfully." + exit 0 + fi +fi + +# Copy the newly built RPM to the appropriate repository +if [ "$COPY_NEW_RPM" = true ]; then + echo "Identifying the newly built RPMs..." + + for ARCH in x86_64 noarch; do + RPM_DIR=~/rpmbuild/RPMS/$ARCH + + # Check if the RPM directory exists + if [ ! -d "$RPM_DIR" ]; then + echo "Warning: Directory $RPM_DIR does not exist. Skipping $ARCH." + continue + fi + + # Find all matching RPMs and copy them to the appropriate repository directory + NEW_RPMS=$(find "$RPM_DIR" -name "cloudberry-*.rpm" ! -name "*debuginfo*.rpm") + if [ -n "$NEW_RPMS" ]; then + for NEW_RPM in $NEW_RPMS; do + # Determine the repository (el8 or el9) based on the RPM filename + if echo "$NEW_RPM" | grep -q "\.el8\."; then + TARGET_REPO="$LOCAL_DIR/el8/$ARCH" + elif echo "$NEW_RPM" | grep -q "\.el9\."; then + TARGET_REPO="$LOCAL_DIR/el9/$ARCH" + else + echo "Error: Unable to determine the correct repository for $NEW_RPM. Exiting." + exit 1 + fi + + # Ensure the target repository directory exists + mkdir -p "$TARGET_REPO" + + # Copy the RPM to the target repository + echo "Copying $NEW_RPM to $TARGET_REPO..." + cp "$NEW_RPM" "$TARGET_REPO/" + echo "Copy operation completed." + done + else + echo "No matching RPMs found in $RPM_DIR." + fi + done +fi + +# Define the directories for `el8` and `el9` repositories +REPO_DIRS=("$LOCAL_DIR/el8/x86_64" "$LOCAL_DIR/el8/noarch" "$LOCAL_DIR/el9/x86_64" "$LOCAL_DIR/el9/noarch") + +# Traverse each repository directory (el8 and el9) and sign RPMs +for REPO_DIR in "${REPO_DIRS[@]}"; do + if [ -d "$REPO_DIR" ]; then + echo "Processing repository at $REPO_DIR..." + + # Export GPG public key for clients and place it in the root of the repository + TEMP_GPG_KEY=$(mktemp) + echo "Exporting GPG public key to temporary location..." + gpg --armor --export "$GPG_KEY_ID" > "$TEMP_GPG_KEY" + + # Import the GPG public key to RPM database + echo "Importing GPG public key into RPM database..." 
+ sudo rpm --import "$TEMP_GPG_KEY" + + # Sign each RPM in the directory + echo "Signing RPM packages in $REPO_DIR..." + find "$REPO_DIR" -name "*.rpm" -exec rpm --addsign --define "_gpg_name $GPG_KEY_ID" {} \; + + # Verify that RPMs were signed successfully + echo "Verifying RPM signatures in $REPO_DIR..." + find "$REPO_DIR" -name "*.rpm" -exec rpm -Kv {} \; + + # Recreate the repository metadata + echo "Updating repository metadata in $REPO_DIR..." + createrepo --update "$REPO_DIR" + + # Sign the repository metadata, automatically overwriting if the file already exists + echo "Signing repository metadata in $REPO_DIR..." + gpg --batch --yes --detach-sign --armor --local-user "$GPG_KEY_ID" "$REPO_DIR/repodata/repomd.xml" + + # Copy the public key to each repo + cp "$TEMP_GPG_KEY" "$REPO_DIR/RPM-GPG-KEY-cloudberry" + + # Clean up temporary GPG key + rm -f "$TEMP_GPG_KEY" + else + echo "Warning: Repository directory $REPO_DIR does not exist. Skipping..." + fi +done + +# Upload changes to S3 with --delete option if requested +if [ "$UPLOAD_WITH_DELETE" = true ]; then + if [ -z "${S3_BUCKET:-}" ]; then + echo "Error: S3 bucket (-s) is required for upload with delete." + exit 1 + fi + + echo "Uploading local changes to S3 with --delete option..." + aws s3 sync "$LOCAL_DIR" "$S3_BUCKET" --delete + echo "S3 sync with --delete completed." +fi + +# Print completion message +echo "S3 repository sync, RPM signing, metadata signing, and public key export completed successfully." diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 8f314ee8ff4..a05f010b2a5 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -21,9 +21,10 @@ - The module provides the pg_buffercache_pages() - function, wrapped in the pg_buffercache view, and - the pg_buffercache_summary() function. + This module provides the pg_buffercache_pages() + function (wrapped in the pg_buffercache view), + the pg_buffercache_summary() function, and the + pg_buffercache_usage_counts() function. @@ -38,6 +39,12 @@ row summarizing the state of the shared buffer cache. + + The pg_buffercache_usage_counts() function returns a set + of records, each row describing the number of buffers with a given usage + count. + + By default, use is restricted to superusers and roles with privileges of the pg_monitor role. Access may be granted to others @@ -242,7 +249,7 @@ usagecount_avg float8 - Average usagecount of used shared buffers + Average usage count of used shared buffers @@ -266,6 +273,84 @@ + The <function>pg_buffercache_usage_counts()</function> Function + + + The definitions of the columns exposed by the function are shown in + . + + + + <function>pg_buffercache_usage_counts()</function> Output Columns + + + + + Column Type + + + Description + + + + + + + + usage_count int4 + + + A possible buffer usage count + + + + + + buffers int4 + + + Number of buffers with the usage count + + + + + + dirty int4 + + + Number of dirty buffers with the usage count + + + + + + pinned int4 + + + Number of pinned buffers with the usage count + + + + +
+ + + The pg_buffercache_usage_counts() function returns a + set of rows summarizing the states of all shared buffers, aggregated over + the possible usage count values. Similar and more detailed information is + provided by the pg_buffercache view, but + pg_buffercache_usage_counts() is significantly cheaper. + + + + Like the pg_buffercache view, + pg_buffercache_usage_counts() does not acquire buffer + manager locks. Therefore concurrent activity can lead to minor inaccuracies + in the result. + +
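To make the cost/detail trade-off described above concrete: roughly the same aggregation can be computed from the row-per-buffer pg_buffercache view, only more expensively. This is an editorial sketch, not part of the patch; it assumes the view's usual columns (usagecount, isdirty, pinning_backends) and the regression database used in the sample output that follows.

```bash
# Approximate pg_buffercache_usage_counts() by aggregating the view;
# unused buffers have NULL usagecount and are folded into usage_count 0.
psql -d regression -c "
SELECT coalesce(usagecount, 0)                      AS usage_count,
       count(*)                                     AS buffers,
       count(*) FILTER (WHERE isdirty)              AS dirty,
       count(*) FILTER (WHERE pinning_backends > 0) AS pinned
FROM pg_buffercache
GROUP BY 1
ORDER BY 1;"
```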
+ + Sample Output @@ -299,6 +384,18 @@ regression=# SELECT * FROM pg_buffercache_summary(); --------------+----------------+---------------+----------------+---------------- 248 | 2096904 | 39 | 0 | 3.141129 (1 row) + + +regression=# SELECT * FROM pg_buffercache_usage_counts(); + usage_count | buffers | dirty | pinned +-------------+---------+-------+-------- + 0 | 14650 | 0 | 0 + 1 | 1436 | 671 | 0 + 2 | 102 | 88 | 0 + 3 | 23 | 21 | 0 + 4 | 9 | 7 | 0 + 5 | 164 | 106 | 0 +(6 rows) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index aa0dc14c6e6..c20cf1a5a6e 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -82,7 +82,7 @@ where storage_parameter is: APPENDONLY={TRUE|FALSE} BLOCKSIZE={8192-2097152} ORIENTATION={COLUMN|ROW} - COMPRESSTYPE={ZLIB|QUICKLZ|RLE_TYPE|NONE} + COMPRESSTYPE={ZLIB|RLE_TYPE|NONE} COMPRESSLEVEL={0-9} CHECKSUM={TRUE|FALSE} FILLFACTOR={10-100} @@ -173,7 +173,7 @@ and subpartition_element is: [ TABLESPACE tablespace ] where storage_directive is: - COMPRESSTYPE={ZLIB | QUICKLZ | RLE_TYPE | NONE} + COMPRESSTYPE={ZLIB | RLE_TYPE | NONE} | COMPRESSLEVEL={0-9} | BLOCKSIZE={8192-2097152} diff --git a/doc/src/sgml/ref/create_table_as.sgml b/doc/src/sgml/ref/create_table_as.sgml index fc2df51772e..936184ed83e 100644 --- a/doc/src/sgml/ref/create_table_as.sgml +++ b/doc/src/sgml/ref/create_table_as.sgml @@ -34,7 +34,7 @@ where storage_parameter is: APPENDONLY={TRUE|FALSE} BLOCKSIZE={8192-2097152} ORIENTATION={COLUMN|ROW} - COMPRESSTYPE={ZLIB|QUICKLZ|RLE_TYPE|NONE} + COMPRESSTYPE={ZLIB|RLE_TYPE|NONE} COMPRESSLEVEL={0-9} CHECKSUM={TRUE|FALSE} FILLFACTOR={10-100} diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 1dea9341f53..ca6ff8cdc65 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -863,16 +863,6 @@ PostgreSQL documentation and the two systems have different definitions of the collation used to sort the partitioning column. - - - It is best not to use parallelism when restoring from an archive made - with this option, because pg_restore will - not know exactly which partition(s) a given archive data item will - load data into. This could result in inefficiency due to lock - conflicts between parallel jobs, or perhaps even reload failures due - to foreign key constraints being set up before all the relevant data - is loaded. - diff --git a/doc/src/sgml/ref/pg_dumpall.sgml b/doc/src/sgml/ref/pg_dumpall.sgml index 1181e182b1a..5bde886c453 100644 --- a/doc/src/sgml/ref/pg_dumpall.sgml +++ b/doc/src/sgml/ref/pg_dumpall.sgml @@ -395,10 +395,6 @@ PostgreSQL documentation and the two systems have different definitions of the collation used to sort the partitioning column. 
- - diff --git a/getversion b/getversion index 76f6984ada5..b4c29112651 100755 --- a/getversion +++ b/getversion @@ -94,7 +94,7 @@ generate_dev_version() { } # Check if we're in a Git repo and git is available -if type git >/dev/null 2>&1 && [ -d '.git' ]; then +if type git >/dev/null 2>&1 && [ -e '.git' ]; then # Ensure git describe doesn't fail due to shallow clone if git describe --tags --long >/dev/null 2>&1; then if git describe --exact-match >/dev/null 2>&1; then diff --git a/gpAux/Makefile b/gpAux/Makefile index 6502cffd0c4..dcdc2fc5f5e 100644 --- a/gpAux/Makefile +++ b/gpAux/Makefile @@ -124,13 +124,13 @@ DEFPORT=5432 ORCA_CONFIG=--enable-orca -rhel6_x86_64_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext -rhel7_x86_64_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext -rhel8_x86_64_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext -ubuntu18.04_x86_64_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext -sles12_x86_64_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext -rhel7_aarch64_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext -common_CONFIGFLAGS=--with-quicklz --with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext +rhel6_x86_64_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext +rhel7_x86_64_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext +rhel8_x86_64_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext +ubuntu18.04_x86_64_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext +sles12_x86_64_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext +rhel7_aarch64_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext 
+common_CONFIGFLAGS=--with-lz4 --with-gssapi --enable-mapreduce --enable-orafce --enable-ic-proxy ${ORCA_CONFIG} --enable-gpcloud --with-libxml --with-ssl=openssl --with-pam --with-ldap --with-pythonsrc-ext #BLD_CONFIGFLAGS=$($(BLD_ARCH)_CONFIGFLAGS) BLD_CONFIGFLAGS=$(common_CONFIGFLAGS) @@ -221,7 +221,6 @@ define BUILD_STEPS cd $(BUILDDIR) && PYGRESQL_LDFLAGS=' -Wl,-rpath,\$$$$ORIGIN/.. -Wl,--enable-new-dtags ' $(MAKE) $(PARALLEL_MAKE_OPTS) install cd $(BUILDDIR)/src/pl/plpython && $(MAKE) clean && $(MAKE) $(PARALLEL_MAKE_OPTS) install && cd $(BUILDDIR) cd $(BUILDDIR)/src/pl/plperl && $(MAKE) clean && echo "LDFLAGS += -Wl,-rpath,$(perl_archlibexp)/CORE -Wl,--enable-new-dtags" >> GNUmakefile && echo "LDFLAGS_SL += -Wl,-rpath,$(perl_archlibexp)/CORE -Wl,--enable-new-dtags" >> GNUmakefile && $(MAKE) $(PARALLEL_MAKE_OPTS) install && cd $(BUILDDIR) - #@$(MAKE) greenplum_path INSTLOC=$(INSTLOC) #@$(MAKE) mgmtcopy INSTLOC=$(INSTLOC) @$(MAKE) copylibs INSTLOC=$(INSTLOC) @$(MAKE) clients INSTLOC=$(INSTLOC) CLIENTSINSTLOC=$(CLIENTSINSTLOC) @@ -233,7 +232,7 @@ endef ifeq "$(BLD_GPDB_BUILDSET)" "partial" define BUILD_STEPS rm -rf $(INSTLOC) - cd $(BUILDDIR)/gpMgmt/ && $(MAKE) generate_greenplum_path_file + cd $(BUILDDIR)/gpMgmt/ && $(MAKE) generate_cloudberry_env_file cd $(BUILDDIR)/src/backend/ && $(MAKE) ../../src/include/parser/gram.h cd $(BUILDDIR)/src/backend/ && $(MAKE) ../../src/include/utils/errcodes.h cd $(BUILDDIR)/src/backend/ && $(MAKE) ../../src/include/utils/fmgroids.h diff --git a/gpAux/client/install/src/windows/CreatePackage.bat b/gpAux/client/install/src/windows/CreatePackage.bat index 99079b11cf1..04bd76b52b7 100644 --- a/gpAux/client/install/src/windows/CreatePackage.bat +++ b/gpAux/client/install/src/windows/CreatePackage.bat @@ -3,7 +3,7 @@ set VERSION=%2 echo %VERSION% > %GPDB_INSTALL_PATH%\VERSION copy ..\..\..\..\..\NOTICE %GPDB_INSTALL_PATH% copy ..\..\..\..\..\LICENSE %GPDB_INSTALL_PATH% -copy ..\..\..\scripts\greenplum_clients_path.bat %GPDB_INSTALL_PATH% +copy ..\..\..\scripts\cloudberry_clients_path.bat %GPDB_INSTALL_PATH% mkdir %GPDB_INSTALL_PATH%\lib\python\yaml copy ..\..\..\..\..\gpMgmt\bin\gpload.py %GPDB_INSTALL_PATH%\bin mkdir %GPDB_INSTALL_PATH%\bin\gppylib @@ -15,4 +15,4 @@ REM Install PyYAML using pip instead of extracting from tarball pip3 install --target=%GPDB_INSTALL_PATH%\lib\python PyYAML==5.4.1 perl -p -e "s,__VERSION_PLACEHOLDER__,%VERSION%," greenplum-clients.wxs > greenplum-clients-%VERSION%.wxs candle.exe -nologo greenplum-clients-%VERSION%.wxs -out greenplum-clients-%VERSION%.wixobj -dSRCDIR=%GPDB_INSTALL_PATH% -dVERSION=%VERSION% -light.exe -nologo -sval greenplum-clients-%VERSION%.wixobj -out greenplum-clients-x86_64.msi \ No newline at end of file +light.exe -nologo -sval greenplum-clients-%VERSION%.wixobj -out greenplum-clients-x86_64.msi diff --git a/gpAux/client/install/src/windows/greenplum-clients.wxs b/gpAux/client/install/src/windows/greenplum-clients.wxs index 0f17c299112..4e8f0128d8c 100755 --- a/gpAux/client/install/src/windows/greenplum-clients.wxs +++ b/gpAux/client/install/src/windows/greenplum-clients.wxs @@ -1100,7 +1100,7 @@ If you want to review or change any of your installation settings, click Back. 
C - + diff --git a/gpAux/client/scripts/greenplum_clients_path.bat b/gpAux/client/scripts/cloudberry_clients_path.bat similarity index 100% rename from gpAux/client/scripts/greenplum_clients_path.bat rename to gpAux/client/scripts/cloudberry_clients_path.bat diff --git a/gpAux/client/scripts/greenplum_clients_path.sh b/gpAux/client/scripts/cloudberry_clients_path.sh similarity index 60% rename from gpAux/client/scripts/greenplum_clients_path.sh rename to gpAux/client/scripts/cloudberry_clients_path.sh index 53dd4a286aa..f80ee53ed3f 100644 --- a/gpAux/client/scripts/greenplum_clients_path.sh +++ b/gpAux/client/scripts/cloudberry_clients_path.sh @@ -1,18 +1,3 @@ -# -------------------------------------------------------------------- -# NOTICE from the Apache Cloudberry PPMC -# -------------------------------------------------------------------- -# This file uses the term 'greenplum' to maintain compatibility with -# earlier versions of Apache Cloudberry, which was originally called -# Greenplum. This usage does not refer to VMware Tanzu Greenplum, -# nor does it imply that Apache Cloudberry (Incubating) is affiliated -# with, endorsed by, or sponsored by Broadcom Inc. -# -# This file will be renamed in a future Apache Cloudberry release to -# ensure compliance with Apache Software Foundation guidelines. -# We will announce the change on the project mailing list and website. -# -# See: https://lists.apache.org/thread/b8o974mnnqk6zpy86dgll2pgqcvqgnwm -# -------------------------------------------------------------------- if test -n "${ZSH_VERSION:-}"; then # zsh diff --git a/gpAux/gpdemo/Makefile b/gpAux/gpdemo/Makefile index b6e794e57ab..386876b9b87 100644 --- a/gpAux/gpdemo/Makefile +++ b/gpAux/gpdemo/Makefile @@ -9,7 +9,7 @@ top_builddir = ../.. -include $(top_builddir)/src/Makefile.global -SHELL := /usr/bin/bash +SHELL := /usr/bin/env bash all: $(MAKE) clean diff --git a/gpAux/gpdemo/README b/gpAux/gpdemo/README index 314de0d60b3..79e976fbd83 100644 --- a/gpAux/gpdemo/README +++ b/gpAux/gpdemo/README @@ -31,13 +31,13 @@ RUNNING GP DEMO su - gpadmin -2. Source greenplum_path.sh +2. Source cloudberry-env.sh - . /usr/local/gpdb/greenplum_path.sh + . /usr/local/cloudberry-db/cloudberry-env.sh Note: There is a space between the dot and the slash. - The "." will source the greenplum_path.sh into the current + The "." will source the cloudberry-env.sh into the current shell, instead of starting a subprocess. 3. Create the cluster at the current directory: diff --git a/gpAux/gpdemo/demo_cluster.sh b/gpAux/gpdemo/demo_cluster.sh index 8293fc687db..225bb76a5ee 100755 --- a/gpAux/gpdemo/demo_cluster.sh +++ b/gpAux/gpdemo/demo_cluster.sh @@ -120,7 +120,7 @@ cleanDemo(){ ## (export COORDINATOR_DATA_DIRECTORY=$QDDIR/${SEG_PREFIX}-1; - source ${GPHOME}/greenplum_path.sh; + source ${GPHOME}/cloudberry-env.sh; gpstop -ai) ## @@ -180,7 +180,7 @@ done if [ -z "${GPHOME}" ]; then echo "FATAL: The GPHOME environment variable is not set." echo "" - echo " You can set it by sourcing the greenplum_path.sh" + echo " You can set it by sourcing the cloudberry-env.sh" echo " file in your Cloudberry installation directory." 
echo "" exit 1 diff --git a/gpAux/releng/gppkg.mk b/gpAux/releng/gppkg.mk index 7706e7f4d8e..e93fcd1f74c 100644 --- a/gpAux/releng/gppkg.mk +++ b/gpAux/releng/gppkg.mk @@ -62,7 +62,7 @@ ifdef DEPENDENT_RPMS cp $${dep_rpm} gppkg/deps; \ done endif - source $(INSTLOC)/greenplum_path.sh && gppkg --build gppkg + source $(INSTLOC)/cloudberry-env.sh && gppkg --build gppkg rm -rf gppkg clean: @@ -74,6 +74,6 @@ ifdef EXTRA_CLEAN endif install: $(TARGET_GPPKG) - source $(INSTLOC)/greenplum_path.sh && gppkg -i $(TARGET_GPPKG) + source $(INSTLOC)/cloudberry-env.sh && gppkg -i $(TARGET_GPPKG) .PHONY: install clean diff --git a/gpMgmt/Makefile b/gpMgmt/Makefile index 372dae8ed46..60abfd04cb3 100644 --- a/gpMgmt/Makefile +++ b/gpMgmt/Makefile @@ -6,12 +6,12 @@ SUBDIRS= sbin bin doc $(recurse) -generate_greenplum_path_file: +generate_cloudberry_env_file: mkdir -p $(DESTDIR)$(prefix) unset LIBPATH; \ - bin/generate-greenplum-path.sh > $(DESTDIR)$(prefix)/greenplum_path.sh + bin/generate-cloudberry-env.sh > $(DESTDIR)$(prefix)/cloudberry-env.sh -install: generate_greenplum_path_file +install: generate_cloudberry_env_file mkdir -p $(DESTDIR)$(prefix)/lib/python # Setup /lib/python contents diff --git a/gpMgmt/bin/Makefile b/gpMgmt/bin/Makefile index 3c4b6d2b031..c5eb6ccba9c 100644 --- a/gpMgmt/bin/Makefile +++ b/gpMgmt/bin/Makefile @@ -90,12 +90,27 @@ PYYAML_VERSION=5.4.1 download-python-deps: @echo "--- Downloading Python dependencies for gpMgmt modules" @mkdir -p $(PYLIB_SRC_EXT) - # Download psutil using curl - curl -sSL https://files.pythonhosted.org/packages/source/p/psutil/psutil-$(PSUTIL_VERSION).tar.gz -o $(PYLIB_SRC_EXT)/psutil-$(PSUTIL_VERSION).tar.gz - # Download PyYAML using curl - curl -sSL https://files.pythonhosted.org/packages/source/P/PyYAML/PyYAML-$(PYYAML_VERSION).tar.gz -o $(PYLIB_SRC_EXT)/PyYAML-$(PYYAML_VERSION).tar.gz - # Download PyGreSQL using curl - curl -sSL https://files.pythonhosted.org/packages/source/P/PyGreSQL/PyGreSQL-$(PYGRESQL_VERSION).tar.gz -o $(PYLIB_SRC_EXT)/PyGreSQL-$(PYGRESQL_VERSION).tar.gz + # Download psutil using curl (only if not exists) + @if [ ! -f $(PYLIB_SRC_EXT)/psutil-$(PSUTIL_VERSION).tar.gz ]; then \ + echo "Downloading psutil-$(PSUTIL_VERSION).tar.gz..."; \ + curl -sSL https://files.pythonhosted.org/packages/source/p/psutil/psutil-$(PSUTIL_VERSION).tar.gz -o $(PYLIB_SRC_EXT)/psutil-$(PSUTIL_VERSION).tar.gz; \ + else \ + echo "psutil-$(PSUTIL_VERSION).tar.gz already exists, skipping download"; \ + fi + # Download PyYAML using curl (only if not exists) + @if [ ! -f $(PYLIB_SRC_EXT)/PyYAML-$(PYYAML_VERSION).tar.gz ]; then \ + echo "Downloading PyYAML-$(PYYAML_VERSION).tar.gz..."; \ + curl -sSL https://files.pythonhosted.org/packages/source/P/PyYAML/PyYAML-$(PYYAML_VERSION).tar.gz -o $(PYLIB_SRC_EXT)/PyYAML-$(PYYAML_VERSION).tar.gz; \ + else \ + echo "PyYAML-$(PYYAML_VERSION).tar.gz already exists, skipping download"; \ + fi + # Download PyGreSQL using curl (only if not exists) + @if [ ! 
-f $(PYLIB_SRC_EXT)/PyGreSQL-$(PYGRESQL_VERSION).tar.gz ]; then \ + echo "Downloading PyGreSQL-$(PYGRESQL_VERSION).tar.gz..."; \ + curl -sSL https://files.pythonhosted.org/packages/source/P/PyGreSQL/PyGreSQL-$(PYGRESQL_VERSION).tar.gz -o $(PYLIB_SRC_EXT)/PyGreSQL-$(PYGRESQL_VERSION).tar.gz; \ + else \ + echo "PyGreSQL-$(PYGRESQL_VERSION).tar.gz already exists, skipping download"; \ + fi # Install wheel and cython for PyYAML building pip3 install --user wheel "cython<3.0.0" @@ -150,7 +165,12 @@ $(MOCK_BIN): pip3 install mock;\ else\ mkdir -p $(PYLIB_SRC_EXT) && \ - curl -sSL https://files.pythonhosted.org/packages/source/m/mock/mock-$(MOCK_VERSION).zip -o $(PYLIB_SRC_EXT)/mock-$(MOCK_VERSION).zip && \ + if [ ! -f $(PYLIB_SRC_EXT)/mock-$(MOCK_VERSION).zip ]; then \ + echo "Downloading mock-$(MOCK_VERSION).zip..."; \ + curl -sSL https://files.pythonhosted.org/packages/source/m/mock/mock-$(MOCK_VERSION).zip -o $(PYLIB_SRC_EXT)/mock-$(MOCK_VERSION).zip; \ + else \ + echo "mock-$(MOCK_VERSION).zip already exists, skipping download"; \ + fi && \ mkdir -p $(PYTHONSRC_INSTALL_SITE) && \ cd $(PYLIB_SRC_EXT)/ && unzip -q $(MOCK_DIR).zip && \ cd $(PYLIB_SRC_EXT)/$(MOCK_DIR)/ && \ diff --git a/gpMgmt/bin/README.md b/gpMgmt/bin/README.md index e28809bd86e..3dae8248293 100644 --- a/gpMgmt/bin/README.md +++ b/gpMgmt/bin/README.md @@ -12,7 +12,7 @@ To run any of these python scripts, necessary libraries must be installed, and P PYTHONPATH="\$GPHOME/lib/python:${PYTHONPATH}" ``` -This will be set automatically with a `source $GPHOME/greenplum_path.sh` +This will be set automatically with a `source $GPHOME/cloudberry-env.sh` ## Python Version diff --git a/gpMgmt/bin/analyzedb b/gpMgmt/bin/analyzedb index cc51e265927..48d8e16872c 100755 --- a/gpMgmt/bin/analyzedb +++ b/gpMgmt/bin/analyzedb @@ -37,7 +37,7 @@ try: from gppylib.operations.unix import CheckDir, CheckFile, MakeDir except ImportError as e: - sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('Cannot import modules. Please check that you have sourced cloudberry-env.sh. Detail: ' + str(e)) EXECNAME = 'analyzedb' STATEFILE_DIR = 'db_analyze' diff --git a/gpMgmt/bin/generate-cloudberry-env.sh b/gpMgmt/bin/generate-cloudberry-env.sh new file mode 100755 index 00000000000..7f1f9074efc --- /dev/null +++ b/gpMgmt/bin/generate-cloudberry-env.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +cat <<"EOF" +if test -n "${ZSH_VERSION:-}"; then + # zsh + SCRIPT_PATH="${(%):-%x}" +elif test -n "${BASH_VERSION:-}"; then + # bash + SCRIPT_PATH="${BASH_SOURCE[0]}" +else + # Unknown shell, hope below works. + # Tested with dash + result=$(lsof -p $$ -Fn | tail --lines=1 | xargs --max-args=2 | cut --delimiter=' ' --fields=2) + SCRIPT_PATH=${result#n} +fi + +if test -z "$SCRIPT_PATH"; then + echo "The shell cannot be identified. \$GPHOME may not be set correctly." >&2 +fi +SCRIPT_DIR="$(cd "$(dirname "${SCRIPT_PATH}")" >/dev/null 2>&1 && pwd)" + +if [ ! 
-L "${SCRIPT_DIR}" ]; then + GPHOME=${SCRIPT_DIR} +else + GPHOME=$(readlink "${SCRIPT_DIR}") +fi +EOF + +cat <<"EOF" +PYTHONPATH="${GPHOME}/lib/python" +PATH="${GPHOME}/bin:${PATH}" +LD_LIBRARY_PATH="${GPHOME}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" + +if [ -e "${GPHOME}/etc/openssl.cnf" ]; then + OPENSSL_CONF="${GPHOME}/etc/openssl.cnf" +fi + +#setup JAVA_HOME +if [ -x "${GPHOME}/ext/jdk/bin/java" ]; then + JAVA_HOME="${GPHOME}/ext/jdk" + PATH="${JAVA_HOME}/bin:${PATH}" + CLASSPATH=${JAVA_HOME}/lib/dt.jar:${JAVA_HOME}/lib/tool.jar +fi + +export GPHOME +export PATH +export PYTHONPATH +export LD_LIBRARY_PATH +export OPENSSL_CONF +export JAVA_HOME +export CLASSPATH + +# Load the external environment variable files +if [ -d "${GPHOME}/etc/environment.d" ]; then + LOGGER=$(which logger 2> /dev/null || which true) + set -o allexport + for env in $(find "${GPHOME}/etc/environment.d" -regextype sed -regex '.*\/[0-9][0-9]-.*\.conf$' -type f | sort -n); do + $LOGGER -t "greenplum-path.sh" "loading environment from ${env}" + source "${env}" + done + set +o allexport +fi +EOF diff --git a/gpMgmt/bin/generate-greenplum-path.sh b/gpMgmt/bin/generate-greenplum-path.sh deleted file mode 100755 index 5a3b6e35698..00000000000 --- a/gpMgmt/bin/generate-greenplum-path.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env bash -# -------------------------------------------------------------------- -# NOTICE from the Apache Cloudberry PPMC -# -------------------------------------------------------------------- -# This file uses the term 'greenplum' to maintain compatibility with -# earlier versions of Apache Cloudberry, which was originally called -# Greenplum. This usage does not refer to VMware Tanzu Greenplum, -# nor does it imply that Apache Cloudberry (Incubating) is affiliated -# with, endorsed by, or sponsored by Broadcom Inc. -# -# This file will be renamed in a future Apache Cloudberry release to -# ensure compliance with Apache Software Foundation guidelines. -# We will announce the change on the project mailing list and website. -# -# See: https://lists.apache.org/thread/b8o974mnnqk6zpy86dgll2pgqcvqgnwm -# -------------------------------------------------------------------- - -cat <<"EOF" -if [ -n "${PS1-}" ]; then - echo " -# -------------------------------------------------------------------- -# NOTICE from the Apache Cloudberry PPMC -# -------------------------------------------------------------------- -# This file uses the term 'greenplum' to maintain compatibility with -# earlier versions of Apache Cloudberry, which was originally called -# Greenplum. This usage does not refer to VMware Tanzu Greenplum, -# nor does it imply that Apache Cloudberry (Incubating) is affiliated -# with, endorsed by, or sponsored by Broadcom Inc. -# -# This file will be renamed in a future Apache Cloudberry release to -# ensure compliance with Apache Software Foundation guidelines. -# We will announce the change on the project mailing list and website. -# -# See: https://lists.apache.org/thread/b8o974mnnqk6zpy86dgll2pgqcvqgnwm -# -------------------------------------------------------------------- -" -fi -EOF - -cat <<"EOF" -if test -n "${ZSH_VERSION:-}"; then - # zsh - SCRIPT_PATH="${(%):-%x}" -elif test -n "${BASH_VERSION:-}"; then - # bash - SCRIPT_PATH="${BASH_SOURCE[0]}" -else - # Unknown shell, hope below works. 
- # Tested with dash - result=$(lsof -p $$ -Fn | tail --lines=1 | xargs --max-args=2 | cut --delimiter=' ' --fields=2) - SCRIPT_PATH=${result#n} -fi - -if test -z "$SCRIPT_PATH"; then - echo "The shell cannot be identified. \$GPHOME may not be set correctly." >&2 -fi -SCRIPT_DIR="$(cd "$(dirname "${SCRIPT_PATH}")" >/dev/null 2>&1 && pwd)" - -if [ ! -L "${SCRIPT_DIR}" ]; then - GPHOME=${SCRIPT_DIR} -else - GPHOME=$(readlink "${SCRIPT_DIR}") -fi -EOF - -cat <<"EOF" -PYTHONPATH="${GPHOME}/lib/python" -PATH="${GPHOME}/bin:${PATH}" -LD_LIBRARY_PATH="${GPHOME}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" - -if [ -e "${GPHOME}/etc/openssl.cnf" ]; then - OPENSSL_CONF="${GPHOME}/etc/openssl.cnf" -fi - -#setup JAVA_HOME -if [ -x "${GPHOME}/ext/jdk/bin/java" ]; then - JAVA_HOME="${GPHOME}/ext/jdk" - PATH="${JAVA_HOME}/bin:${PATH}" - CLASSPATH=${JAVA_HOME}/lib/dt.jar:${JAVA_HOME}/lib/tool.jar -fi - -export GPHOME -export PATH -export PYTHONPATH -export LD_LIBRARY_PATH -export OPENSSL_CONF -export JAVA_HOME -export CLASSPATH - -# Load the external environment variable files -if [ -d "${GPHOME}/etc/environment.d" ]; then - LOGGER=$(which logger 2> /dev/null || which true) - set -o allexport - for env in $(find "${GPHOME}/etc/environment.d" -regextype sed -regex '.*\/[0-9][0-9]-.*\.conf$' -type f | sort -n); do - $LOGGER -t "greenplum-path.sh" "loading environment from ${env}" - source "${env}" - done - set +o allexport -fi -EOF diff --git a/gpMgmt/bin/gpactivatestandby b/gpMgmt/bin/gpactivatestandby index 4c2069f0970..020ead515f3 100755 --- a/gpMgmt/bin/gpactivatestandby +++ b/gpMgmt/bin/gpactivatestandby @@ -33,7 +33,7 @@ try: from gppylib.userinput import ask_yesno except Exception as e: sys.exit('ERROR: Cannot import modules. Please check that you ' - 'have sourced greenplum_path.sh. Detail: ' + str(e)) + 'have sourced cloudberry-env.sh. Detail: ' + str(e)) EXECNAME = os.path.split(__file__)[-1] diff --git a/gpMgmt/bin/gpcheckresgroupv2impl b/gpMgmt/bin/gpcheckresgroupv2impl index 4e15d048eb1..e52931f7a1f 100755 --- a/gpMgmt/bin/gpcheckresgroupv2impl +++ b/gpMgmt/bin/gpcheckresgroupv2impl @@ -2,6 +2,8 @@ # Copyright (c) 2017, VMware, Inc. or its affiliates. 
import os +import sys +import argparse from functools import reduce @@ -26,9 +28,10 @@ class CgroupValidation(object): class CgroupValidationVersionTwo(CgroupValidation): - def __init__(self): + def __init__(self, cgroup_parent=None): self.mount_point = self.detect_cgroup_mount_point() self.tab = {"r": os.R_OK, "w": os.W_OK, "x": os.X_OK, "f": os.F_OK} + self.cgroup_parent = cgroup_parent if cgroup_parent else "gpdb.service" def validate_all(self): """ @@ -43,23 +46,23 @@ class CgroupValidationVersionTwo(CgroupValidation): self.validate_permission("cgroup.procs", "rw") - self.validate_permission("gpdb/", "rwx") - self.validate_permission("gpdb/cgroup.procs", "rw") + self.validate_permission(self.cgroup_parent + "/", "rwx") + self.validate_permission(self.cgroup_parent + "/cgroup.procs", "rw") - self.validate_permission("gpdb/cpu.max", "rw") - self.validate_permission("gpdb/cpu.weight", "rw") - self.validate_permission("gpdb/cpu.weight.nice", "rw") - self.validate_permission("gpdb/cpu.stat", "r") + self.validate_permission(self.cgroup_parent + "/cpu.max", "rw") + self.validate_permission(self.cgroup_parent + "/cpu.weight", "rw") + self.validate_permission(self.cgroup_parent + "/cpu.weight.nice", "rw") + self.validate_permission(self.cgroup_parent + "/cpu.stat", "r") - self.validate_permission("gpdb/cpuset.cpus", "rw") - self.validate_permission("gpdb/cpuset.cpus.partition", "rw") - self.validate_permission("gpdb/cpuset.mems", "rw") - self.validate_permission("gpdb/cpuset.cpus.effective", "r") - self.validate_permission("gpdb/cpuset.mems.effective", "r") + self.validate_permission(self.cgroup_parent + "/cpuset.cpus", "rw") + self.validate_permission(self.cgroup_parent + "/cpuset.cpus.partition", "rw") + self.validate_permission(self.cgroup_parent + "/cpuset.mems", "rw") + self.validate_permission(self.cgroup_parent + "/cpuset.cpus.effective", "r") + self.validate_permission(self.cgroup_parent + "/cpuset.mems.effective", "r") - self.validate_permission("gpdb/memory.current", "r") + self.validate_permission(self.cgroup_parent + "/memory.current", "r") - self.validate_permission("gpdb/io.max", "rw") + self.validate_permission(self.cgroup_parent + "/io.max", "rw") def die(self, msg): raise ValidationException("cgroup is not properly configured: {}".format(msg)) @@ -85,7 +88,15 @@ class CgroupValidationVersionTwo(CgroupValidation): if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Validate cgroup v2 configuration for resource groups') + parser.add_argument('--cgroup-parent', + dest='cgroup_parent', + default=None, + help='The cgroup parent directory name (gp_resource_group_cgroup_parent value)') + + args = parser.parse_args() + try: - CgroupValidationVersionTwo().validate_all() + CgroupValidationVersionTwo(cgroup_parent=args.cgroup_parent).validate_all() except ValidationException as e: exit(e.message) diff --git a/gpMgmt/bin/gpconfig b/gpMgmt/bin/gpconfig index 7bd3023ea85..f4e3ce7c62d 100755 --- a/gpMgmt/bin/gpconfig +++ b/gpMgmt/bin/gpconfig @@ -15,6 +15,7 @@ import os import sys import re +import psutil try: from gppylib.gpparseopts import OptParser, OptChecker @@ -34,7 +35,7 @@ try: from gpconfig_modules.parse_guc_metadata import ParseGuc except ImportError as err: sys.exit('Cannot import modules. Please check that you have sourced ' - 'greenplum_path.sh. Detail: ' + str(err)) + 'cloudberry-env.sh. 
Detail: ' + str(err)) EXECNAME = os.path.split(__file__)[-1] @@ -170,6 +171,20 @@ class Guc: return msg elif newval != "'queue'": return "the value of gp_resource_manager must be 'group' or 'group-v2' or 'queue'" + elif self.name == "gp_resource_group_cgroup_parent": + base = newval.strip("'") + if not re.match("^[0-9a-zA-Z][-._0-9a-zA-Z]*$", base): + return "resource group cgroup parent can only contains alphabet, number, and non-leading . _ -" + + path = None + for partition in psutil.disk_partitions(all=True): + if partition.fstype == "cgroup2": + path = partition.mountpoint + "/" + base + break + if path is None: + return "cannot find cgroup v2 mountpoint" + if not os.path.isdir(path): + return "'%s' doesn't exists or is not a directory" % path elif self.name == 'unix_socket_permissions': if newval[0] != '0': diff --git a/gpMgmt/bin/gpdeletesystem b/gpMgmt/bin/gpdeletesystem index 8299d86d820..3730d2758fe 100755 --- a/gpMgmt/bin/gpdeletesystem +++ b/gpMgmt/bin/gpdeletesystem @@ -24,7 +24,7 @@ try: from gppylib.operations.segment_tablespace_locations import get_tablespace_locations except ImportError as e: sys.exit('ERROR: Cannot import modules. Please check that you ' - 'have sourced greenplum_path.sh. Detail: ' + str(e)) + 'have sourced cloudberry-env.sh. Detail: ' + str(e)) EXECNAME = os.path.split(__file__)[-1] diff --git a/gpMgmt/bin/gpdirtableload b/gpMgmt/bin/gpdirtableload index 2ad6933059d..72f6dac9ca0 100755 --- a/gpMgmt/bin/gpdirtableload +++ b/gpMgmt/bin/gpdirtableload @@ -420,7 +420,7 @@ class gpdirtableload: 'greenplum_loaders_path.sh') elif os.environ.get('GPHOME'): srcfile = os.path.join(os.environ.get('GPHOME'), - 'greenplum_path.sh') + 'cloudberry-env.sh') if (not (srcfile and os.path.exists(srcfile))): self.log(self.ERROR, 'cannot find cloudberry environment ' + 'file: environment misconfigured') @@ -488,7 +488,7 @@ class gpdirtableload: 'greenplum_loaders_path.sh') elif os.environ.get('GPHOME'): srcfile = os.path.join(os.environ.get('GPHOME'), - 'greenplum_path.sh') + 'cloudberry-env.sh') if (not (srcfile and os.path.exists(srcfile))): self.log(self.ERROR, 'cannot find cloudberry environment ' + 'file: environment misconfigured') diff --git a/gpMgmt/bin/gpexpand b/gpMgmt/bin/gpexpand index 562fad58213..dcbed918a19 100755 --- a/gpMgmt/bin/gpexpand +++ b/gpMgmt/bin/gpexpand @@ -44,7 +44,7 @@ try: from gppylib.operations.update_pg_hba_on_segments import update_pg_hba_on_segments except ImportError as e: - sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('ERROR: Cannot import modules. Please check that you have sourced cloudberry-env.sh. Detail: ' + str(e)) # constants MAX_PARALLEL_EXPANDS = 96 diff --git a/gpMgmt/bin/gpinitstandby b/gpMgmt/bin/gpinitstandby index 3e0ee7516bf..ad14c04946e 100755 --- a/gpMgmt/bin/gpinitstandby +++ b/gpMgmt/bin/gpinitstandby @@ -22,7 +22,7 @@ try: from gppylib.commands.pg import PgBaseBackup except ImportError as e: sys.exit('ERROR: Cannot import modules. Please check that you ' - 'have sourced greenplum_path.sh. Detail: ' + str(e)) + 'have sourced cloudberry-env.sh. 
Detail: ' + str(e)) EXECNAME = os.path.split(__file__)[-1] diff --git a/gpMgmt/bin/gpinitsystem b/gpMgmt/bin/gpinitsystem index 1ce4132e53d..fa85d42ae3f 100755 --- a/gpMgmt/bin/gpinitsystem +++ b/gpMgmt/bin/gpinitsystem @@ -2271,7 +2271,7 @@ LOG_MSG "[INFO]:-Apache Cloudberry instance successfully created" 1 LOG_MSG "[INFO]:-------------------------------------------------------" 1 LOG_MSG "[INFO]:-To complete the environment configuration, please " 1 LOG_MSG "[INFO]:-update $USER_NAME .bashrc file with the following" 1 -LOG_MSG "[INFO]:-1. Ensure that the greenplum_path.sh file is sourced" 1 +LOG_MSG "[INFO]:-1. Ensure that the cloudberry-env.sh file is sourced" 1 LOG_MSG "[INFO]:-2. Add \"export COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DIRECTORY}/${SEG_PREFIX}-1\"" 1 LOG_MSG "[INFO]:- to access the Cloudberry scripts for this instance:" 1 LOG_MSG "[INFO]:- or, use -d ${COORDINATOR_DIRECTORY}/${SEG_PREFIX}-1 option for the Cloudberry scripts" 1 diff --git a/gpMgmt/bin/gpload b/gpMgmt/bin/gpload index 33a0ec0e9f3..fb945286fff 100755 --- a/gpMgmt/bin/gpload +++ b/gpMgmt/bin/gpload @@ -1,6 +1,6 @@ #!/usr/bin/env bash if [ ! -z "$GPHOME" ]; then - . $GPHOME/greenplum_path.sh + . $GPHOME/cloudberry-env.sh fi if [ ! -z "$GPHOME_LOADERS" ]; then . $GPHOME_LOADERS/greenplum_loaders_path.sh diff --git a/gpMgmt/bin/gpload.py b/gpMgmt/bin/gpload.py index a1696947bbc..82a0a571166 100755 --- a/gpMgmt/bin/gpload.py +++ b/gpMgmt/bin/gpload.py @@ -1654,7 +1654,7 @@ def start_gpfdists(self): 'greenplum_loaders_path.sh') elif os.environ.get('GPHOME'): srcfile = os.path.join(os.environ.get('GPHOME'), - 'greenplum_path.sh') + 'cloudberry-env.sh') if (not (srcfile and os.path.exists(srcfile))): self.log(self.ERROR, 'cannot find cloudberry environment ' + diff --git a/gpMgmt/bin/gpload_test/gpload2/README b/gpMgmt/bin/gpload_test/gpload2/README index 588812f82a2..b29c28771a0 100644 --- a/gpMgmt/bin/gpload_test/gpload2/README +++ b/gpMgmt/bin/gpload_test/gpload2/README @@ -2,7 +2,7 @@ The remote tests is intended to be run for remote server, which means you are compiling loader package on local machine but GPDB server is located on a separate machine. -To run this test, first source greenplum_path.sh in +To run this test, first source cloudberry-env.sh in cloudberry-db-devel, not the greenplum-loaders-devel, because loader package don't have the necessary diff tools. diff --git a/gpMgmt/bin/gpload_test/gpload2/TEST_local_base.py b/gpMgmt/bin/gpload_test/gpload2/TEST_local_base.py index 5c47e724ff3..45d885ec87a 100755 --- a/gpMgmt/bin/gpload_test/gpload2/TEST_local_base.py +++ b/gpMgmt/bin/gpload_test/gpload2/TEST_local_base.py @@ -95,7 +95,7 @@ def getPortCoordinatorOnly(host = 'localhost',coordinator_value = None, coordinator_pattern = r"Context:\s*-1\s*Value:\s*\d+" command = "gpconfig -s %s" % ( "port" ) - cmd = "source %s/greenplum_path.sh; export COORDINATOR_DATA_DIRECTORY=%s; export PGPORT=%s; %s" \ + cmd = "source %s/cloudberry-env.sh; export COORDINATOR_DATA_DIRECTORY=%s; export PGPORT=%s; %s" \ % (gphome, cdd, port, command) (ok,out) = run(cmd) diff --git a/gpMgmt/bin/gplogfilter b/gpMgmt/bin/gplogfilter index 693fb7c2f05..a667ae34002 100755 --- a/gpMgmt/bin/gplogfilter +++ b/gpMgmt/bin/gplogfilter @@ -24,7 +24,7 @@ try: from gppylib.logfilter import * from gppylib.commands.gp import get_coordinatordatadir except ImportError as e: - sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('ERROR: Cannot import modules. 
Please check that you have sourced cloudberry-env.sh. Detail: ' + str(e)) # These values are from cdb-pg/src/backend/po/*.po TROUBLE_VALUES = [ diff --git a/gpMgmt/bin/gpmemwatcher b/gpMgmt/bin/gpmemwatcher index 6569015bc09..1fb39adfad8 100755 --- a/gpMgmt/bin/gpmemwatcher +++ b/gpMgmt/bin/gpmemwatcher @@ -132,7 +132,7 @@ def launchProcess(host, workdir): if not gphome: raise Exception('Environment Variable GPHOME not set') - py_string = 'source ' + os.path.join(gphome, 'greenplum_path.sh') + '; ' + py_string = 'source ' + os.path.join(gphome, 'cloudberry-env.sh') + '; ' # Now let's just quick check the host as to whether the python version is >= 2.6 try: diff --git a/gpMgmt/bin/gpmovemirrors b/gpMgmt/bin/gpmovemirrors index 7220a0f897e..c5e694399a5 100755 --- a/gpMgmt/bin/gpmovemirrors +++ b/gpMgmt/bin/gpmovemirrors @@ -30,7 +30,7 @@ try: from gppylib.operations.update_pg_hba_on_segments import update_pg_hba_for_new_mirrors except ImportError as e: - sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('ERROR: Cannot import modules. Please check that you have sourced cloudberry-env.sh. Detail: ' + str(e)) # constants GPDB_STOPPED = 1 diff --git a/gpMgmt/bin/gppylib/commands/base.py b/gpMgmt/bin/gppylib/commands/base.py index 98481425071..d455c6e2d13 100755 --- a/gpMgmt/bin/gppylib/commands/base.py +++ b/gpMgmt/bin/gppylib/commands/base.py @@ -510,10 +510,10 @@ def execute(self, cmd, pickled=False, start_new_session=False): if localhost != self.targetHost: cmd.cmdStr = "ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 " \ "{targethost} \"{gphome} {cmdstr}\"".format(targethost=self.targetHost, - gphome=". %s/greenplum_path.sh;" % self.gphome, + gphome=". %s/cloudberry-env.sh;" % self.gphome, cmdstr=cmd.cmdStr) else: - cmd.cmdStr = "bash -c \"{gphome} {cmdstr}\"".format(gphome=". %s/greenplum_path.sh;" % self.gphome, + cmd.cmdStr = "bash -c \"{gphome} {cmdstr}\"".format(gphome=". %s/cloudberry-env.sh;" % self.gphome, cmdstr=cmd.cmdStr) LocalExecutionContext.execute(self, cmd, pickled=pickled, start_new_session=start_new_session) if (cmd.get_stderr().startswith('ssh_exchange_identification: Connection closed by remote host')): diff --git a/gpMgmt/bin/gppylib/commands/test/unit/test_unit_base.py b/gpMgmt/bin/gppylib/commands/test/unit/test_unit_base.py index 6de0f145edf..9f9e6105dad 100644 --- a/gpMgmt/bin/gppylib/commands/test/unit/test_unit_base.py +++ b/gpMgmt/bin/gppylib/commands/test/unit/test_unit_base.py @@ -17,13 +17,13 @@ def test_RemoteExecutionContext_uses_default_gphome(self): self.subject = RemoteExecutionContext("myhost", "my_stdin") cmd = Command("dummy name", "echo 'foo'") self.subject.execute(cmd) - self.assertIn(". %s/greenplum_path.sh;" % GPHOME, cmd.cmdStr) + self.assertIn(". %s/cloudberry-env.sh;" % GPHOME, cmd.cmdStr) def test_RemoteExecutionContext_uses_provided_gphome_when_set(self): self.subject = RemoteExecutionContext(targetHost="myhost", stdin="my_stdin", gphome="other/gphome") cmd = Command("dummy name", "echo 'foo'") self.subject.execute(cmd) - self.assertIn(". other/gphome/greenplum_path.sh;", cmd.cmdStr) + self.assertIn(". 
other/gphome/cloudberry-env.sh;", cmd.cmdStr) def test_LocalExecutionContext_uses_no_environment(self): self.subject = LocalExecutionContext(None) @@ -53,7 +53,7 @@ def test_RemoteExecutionContext_uses_ampersand_multiple(self): cmd.propagate_env_map['bar'] = 1 self.subject.execute(cmd) self.assertEqual("bar=1 && foo=1 && ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 localhost " - "\". gphome/greenplum_path.sh; bar=1 && foo=1 && ls /tmp\"", cmd.cmdStr) + "\". gphome/cloudberry-env.sh; bar=1 && foo=1 && ls /tmp\"", cmd.cmdStr) @patch('gppylib.commands.base.Command.get_stderr', return_value="ssh_exchange_identification: Connection closed by remote host") diff --git a/gpMgmt/bin/gppylib/gpresgroup.py b/gpMgmt/bin/gppylib/gpresgroup.py index e66e44e7ffd..7c36659c277 100644 --- a/gpMgmt/bin/gppylib/gpresgroup.py +++ b/gpMgmt/bin/gppylib/gpresgroup.py @@ -8,6 +8,7 @@ from gppylib.commands.gp import * from gppylib.gparray import GpArray from gppylib.gplog import get_default_logger +from gppylib.db import dbconn class GpResGroup(object): @@ -40,13 +41,40 @@ def validate(): @staticmethod def validate_v2(): + """ + Validate cgroup v2 configuration on all hosts. + + This method: + 1. Connects to the master database to retrieve gp_resource_group_cgroup_parent + 2. Passes this value to gpcheckresgroupv2impl on each host via command line + 3. Each host validates its local cgroup filesystem permissions + """ pool = base.WorkerPool() gp_array = GpArray.initFromCatalog(dbconn.DbURL(), utility=True) host_list = list(set(gp_array.get_hostlist(True))) msg = None + # Get cgroup_parent value from master database + cgroup_parent = None + try: + # Connect to master database to get the GUC parameter + master_dburl = dbconn.DbURL() + with dbconn.connect(master_dburl, utility=True) as conn: + sql = "SHOW gp_resource_group_cgroup_parent" + cursor = dbconn.query(conn, sql) + result = cursor.fetchone() + if result and result[0]: + cgroup_parent = result[0] + else: + return "failed to retrieve gp_resource_group_cgroup_parent parameter from master database" + except Exception as e: + return "failed to retrieve gp_resource_group_cgroup_parent parameter: {}".format(str(e)) + + # Build command with cgroup_parent parameter + cmd_str = "gpcheckresgroupv2impl --cgroup-parent '{}'".format(cgroup_parent) + for h in host_list: - cmd = Command(h, "gpcheckresgroupv2impl", REMOTE, h) + cmd = Command(h, cmd_str, REMOTE, h) pool.addCommand(cmd) pool.join() diff --git a/gpMgmt/bin/gppylib/operations/package.py b/gpMgmt/bin/gppylib/operations/package.py index 2af3bfba381..f6f52a634ae 100644 --- a/gpMgmt/bin/gppylib/operations/package.py +++ b/gpMgmt/bin/gppylib/operations/package.py @@ -24,7 +24,7 @@ from yaml.scanner import ScannerError except ImportError as ex: sys.exit( - 'Operation: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(ex)) + 'Operation: Cannot import modules. Please check that you have sourced cloudberry-env.sh. 
Detail: ' + str(ex)) logger = gplog.get_default_logger() diff --git a/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py b/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py index b7018bc8a3f..81377136f83 100644 --- a/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py +++ b/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py @@ -378,11 +378,6 @@ def signal_handler(sig, frame): if not mirrorBuilder.recover_mirrors(gpEnv, gpArray): if self.termination_requested: self.logger.error("gprecoverseg process was interrupted by the user.") - if self.__options.differentialResynchronization: - self.logger.error("gprecoverseg differential recovery failed. Please check the gpsegrecovery.py log" - " file and rsync log file for more details.") - else: - self.logger.error("gprecoverseg failed. Please check the output for more details.") sys.exit(1) if self.termination_requested: diff --git a/gpMgmt/bin/gppylib/programs/gppkg.py b/gpMgmt/bin/gppylib/programs/gppkg.py index f784386ea5f..70682b67a1c 100755 --- a/gpMgmt/bin/gppylib/programs/gppkg.py +++ b/gpMgmt/bin/gppylib/programs/gppkg.py @@ -20,7 +20,7 @@ from gppylib.operations.unix import ListFilesByPattern except ImportError as ex: - sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(ex)) + sys.exit('Cannot import modules. Please check that you have sourced cloudberry-env.sh. Detail: ' + str(ex)) logger = gplog.get_default_logger() diff --git a/gpMgmt/bin/gppylib/utils.py b/gpMgmt/bin/gppylib/utils.py index fead818d3ac..1adb1e68cdc 100644 --- a/gpMgmt/bin/gppylib/utils.py +++ b/gpMgmt/bin/gppylib/utils.py @@ -290,7 +290,7 @@ def deleteBlock(fileName,beginPattern, endPattern): print(("IOERROR", IOError)) sys.exit() else: - print("***********%s file does not exits"%(fileName)) + print("***********%s file does not exist"%(fileName)) def make_inf_hosts(hp, hstart, hend, istart, iend, hf=None): hfArr = [] diff --git a/gpMgmt/bin/gpshrink b/gpMgmt/bin/gpshrink index 3f6f145d723..6a8d8f5a11f 100644 --- a/gpMgmt/bin/gpshrink +++ b/gpMgmt/bin/gpshrink @@ -57,7 +57,7 @@ try: from gppylib.operations.update_pg_hba_on_segments import update_pg_hba_on_segments except ImportError as e: - sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('ERROR: Cannot import modules. Please check that you have sourced cloudberry-env.sh. 
Detail: ' + str(e)) # constants MAX_PARALLEL_SHRINKS = 96 diff --git a/gpMgmt/bin/gpssh b/gpMgmt/bin/gpssh index b27cb5cf47b..f3105599d21 100755 --- a/gpMgmt/bin/gpssh +++ b/gpMgmt/bin/gpssh @@ -211,7 +211,7 @@ def interactive(): GV.session.login(GV.opt['-h'], GV.USER, GV.DELAY_BEFORE_SEND, GV.PROMPT_VALIDATION_TIMEOUT, GV.SYNC_RETRIES) GV.session.echoCommand = GV.opt['-e'] if GV.opt['-s']: - GV.session.executeCommand("source {0}/greenplum_path.sh".format(os.environ["GPHOME"])) + GV.session.executeCommand("source {0}/cloudberry-env.sh".format(os.environ["GPHOME"])) GV.session.cmdloop() except pexpect.EOF: print('\n[Unexpected EOF from some hosts...]') @@ -265,7 +265,7 @@ def main(): GV.session.login(GV.opt['-h'], GV.USER, GV.DELAY_BEFORE_SEND, GV.PROMPT_VALIDATION_TIMEOUT, GV.SYNC_RETRIES) GV.session.echoCommand = GV.opt['-e'] if GV.opt['-s']: - GV.session.executeCommand("source {0}/greenplum_path.sh".format(os.environ["GPHOME"])) + GV.session.executeCommand("source {0}/cloudberry-env.sh".format(os.environ["GPHOME"])) output = GV.session.executeCommand(GV.argcmd) GV.session.writeCommandOutput(output) if GV.session.verbose: print('[INFO] completed successfully') diff --git a/gpMgmt/bin/gpstart b/gpMgmt/bin/gpstart index 6937d86ac51..bc1f60b6c3d 100755 --- a/gpMgmt/bin/gpstart +++ b/gpMgmt/bin/gpstart @@ -34,7 +34,7 @@ try: from gppylib.utils import TableLogger from gppylib.gp_era import GpEraFile except ImportError as e: - sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('Cannot import modules. Please check that you have sourced cloudberry-env.sh. Detail: ' + str(e)) logger = get_default_logger() @@ -902,9 +902,7 @@ class GpStart: if is_external_fts: if options.fts_hosts is None: - coordinator_data_directory = os.getenv('COORDINATOR_DATA_DIRECTORY') - if coordinator_data_directory is None: - coordinator_data_directory = options.coordinatorDataDirectory + coordinator_data_directory = gp.get_coordinatordatadir() options.fts_hosts = coordinator_data_directory + '/config' + '/fts_host' return GpStart(options.specialMode, options.restricted, diff --git a/gpMgmt/bin/gpstop b/gpMgmt/bin/gpstop index 5f89bb90b02..ab7655597c9 100755 --- a/gpMgmt/bin/gpstop +++ b/gpMgmt/bin/gpstop @@ -36,7 +36,7 @@ try: from gppylib.operations.rebalanceSegments import ReconfigDetectionSQLQueryCommand from gppylib.operations.detect_unreachable_hosts import get_unreachable_segment_hosts except ImportError as e: - sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) + sys.exit('ERROR: Cannot import modules. Please check that you have sourced cloudberry-env.sh. 
Detail: ' + str(e)) DEFAULT_NUM_WORKERS = 64 logger = get_default_logger() @@ -959,9 +959,7 @@ class GpStop: if is_external_fts: if options.fts_hosts is None: - coordinator_data_directory = os.getenv('COORDINATOR_DATA_DIRECTORY') - if coordinator_data_directory is None: - coordinator_data_directory = options.coordinatorDataDirectory + coordinator_data_directory = gp.get_coordinatordatadir() options.fts_hosts = coordinator_data_directory + '/config' + '/fts_host' diff --git a/gpMgmt/bin/lib/gp_bash_functions.sh b/gpMgmt/bin/lib/gp_bash_functions.sh index 2276cdb900c..de76620da2b 100755 --- a/gpMgmt/bin/lib/gp_bash_functions.sh +++ b/gpMgmt/bin/lib/gp_bash_functions.sh @@ -28,8 +28,8 @@ declare -a GPPATH GPPATH=( $GPHOME $MPPHOME $BIZHOME ) if [ ${#GPPATH[@]} -eq 0 ];then echo "[FATAL]:-GPHOME environment variable is required to run GPDB but could not be found." - echo "Please set it by sourcing the greenplum_path.sh in your GPDB installation directory." - echo "Example: ''. /usr/local/gpdb/greenplum_path.sh''" + echo "Please set it by sourcing the cloudberry-env.sh in your GPDB installation directory." + echo "Example: ''. /usr/local/gpdb/cloudberry-env.sh''" exit 1 fi diff --git a/gpMgmt/bin/lib/gpcreateseg.sh b/gpMgmt/bin/lib/gpcreateseg.sh index 2dfa970d864..5dd0f5b0006 100755 --- a/gpMgmt/bin/lib/gpcreateseg.sh +++ b/gpMgmt/bin/lib/gpcreateseg.sh @@ -222,8 +222,8 @@ CREATE_QES_MIRROR () { PG_HBA_ENTRIES="${PG_HBA_ENTRIES}"$'\n'"host replication ${GP_USER} ${PRIMARY_HOSTADDRESS} trust" fi fi - RUN_COMMAND_REMOTE ${PRIMARY_HOSTADDRESS} "${EXPORT_GPHOME}; . ${GPHOME}/greenplum_path.sh; cat - >> ${PRIMARY_DIR}/pg_hba.conf; pg_ctl -D ${PRIMARY_DIR} reload" <<< "${PG_HBA_ENTRIES}" - RUN_COMMAND_REMOTE ${GP_HOSTADDRESS} "${EXPORT_GPHOME}; . ${GPHOME}/greenplum_path.sh; rm -rf ${GP_DIR}; ${GPHOME}/bin/pg_basebackup --wal-method=stream --create-slot --slot='internal_wal_replication_slot' -R -c fast -E ./db_dumps -D ${GP_DIR} -h ${PRIMARY_HOSTADDRESS} -p ${PRIMARY_PORT} --target-gp-dbid ${GP_DBID};" + RUN_COMMAND_REMOTE ${PRIMARY_HOSTADDRESS} "${EXPORT_GPHOME}; . ${GPHOME}/cloudberry-env.sh; cat - >> ${PRIMARY_DIR}/pg_hba.conf; pg_ctl -D ${PRIMARY_DIR} reload" <<< "${PG_HBA_ENTRIES}" + RUN_COMMAND_REMOTE ${GP_HOSTADDRESS} "${EXPORT_GPHOME}; . ${GPHOME}/cloudberry-env.sh; rm -rf ${GP_DIR}; ${GPHOME}/bin/pg_basebackup --wal-method=stream --create-slot --slot='internal_wal_replication_slot' -R -c fast -E ./db_dumps -D ${GP_DIR} -h ${PRIMARY_HOSTADDRESS} -p ${PRIMARY_PORT} --target-gp-dbid ${GP_DBID};" START_QE "-w" RETVAL=$? PARA_EXIT $RETVAL "pg_basebackup of segment data directory from ${PRIMARY_HOSTADDRESS} to ${GP_HOSTADDRESS}" diff --git a/gpMgmt/doc/gpfdist_help b/gpMgmt/doc/gpfdist_help index 4ad1f46ce83..cdb298662fc 100755 --- a/gpMgmt/doc/gpfdist_help +++ b/gpMgmt/doc/gpfdist_help @@ -10,7 +10,7 @@ SYNOPSIS gpfdist [-d ] [-p ] [-l ] [-t ] [-S] [-w