Skip to content

[Bug][PyPaimon] IndexError: index out of range when reading manifest with empty _MIN_VALUES #6962

@klboke

Description

@klboke

Search before asking

  • I searched in the issues and found nothing similar.

Paimon version

1.3

Compute Engine

spark

Minimal reproduce step

# reproduce_paimon_error.py
import configparser
import os
import traceback
import logging

from pypaimon.catalog.catalog_context import CatalogContext
from pypaimon.catalog.rest.rest_catalog import RESTCatalog
from pypaimon.api.options import Options

# --- 复用 paimon_dataset_v3.py 中的配置加载和日志逻辑 ---

# 1. Configure a simple console logger for this script.
logger = logging.getLogger("PaimonTest")
logger.setLevel(logging.INFO)
# Attach the stream handler only once so that re-importing or re-running this
# module in the same interpreter does not produce duplicated log lines.
if not logger.handlers:
    fmt = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s: %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(fmt)
    logger.addHandler(handler)


# 2. Shared configuration loader, so the connection settings stay consistent.
def _load_paimon_catalog_config() -> dict:
    """Load the Paimon catalog configuration from an INI file and the environment.

    Options are first read from the ``DEFAULT`` section of the INI file at
    ``$PAIMON_CONFIG_PATH``; when that variable is unset the path falls back to
    ``$HOME/.paimon_config.ini`` (``/home/admin/.paimon_config.ini`` if ``HOME``
    is unset too). Non-empty environment variables then override the file
    values.

    The file is parsed with interpolation disabled: with the default
    ``BasicInterpolation`` a value containing a literal ``%`` (for example a
    percent-encoded URI or a secret) raises while being read, which used to
    discard that option and every option after it.

    A missing file is logged at INFO level and a file that cannot be parsed is
    logged at WARNING level; in both cases loading continues with the defaults
    and the environment overrides.

    Returns:
        dict: Catalog option name mapped to its string value, or ``None`` when
        the option was supplied by neither source. ``metastore`` defaults to
        ``'rest'``.
    """
    config = {
        'metastore': 'rest',  # rest, hive, filesystem
        'uri': None,
        'warehouse': None,
        'dlf.region': None,
        'token.provider': None,
        'dlf.access-key-id': None,
        'dlf.access-key-secret': None,
    }

    # Read from the configuration file.
    default_path = os.getenv("HOME", "/home/admin") + "/.paimon_config.ini"
    config_path = os.getenv('PAIMON_CONFIG_PATH', default_path)

    if os.path.exists(config_path):
        # interpolation=None keeps '%' characters in values verbatim.
        parser = configparser.ConfigParser(interpolation=None)
        try:
            parser.read(config_path)
        except (configparser.Error, OSError, UnicodeError) as e:
            logger.warning("Failed to load Paimon config from %s: %s", config_path, e)
        else:
            # Option names in the file are identical to the config keys.
            file_values = parser.defaults()
            for key in config:
                if key in file_values:
                    config[key] = file_values[key]
    else:
        logger.info("Paimon config file not found: %s", config_path)

    # Environment variables take precedence over the file; empty values are
    # treated as "not set".
    env_overrides = {
        'PAIMON_METASTORE': 'metastore',
        'PAIMON_URI': 'uri',
        'PAIMON_WAREHOUSE': 'warehouse',
        'DLF_REGION': 'dlf.region',
        'PAIMON_TOKEN_PROVIDER': 'token.provider',
        'DLF_ACCESS_KEY_ID': 'dlf.access-key-id',
        'DLF_ACCESS_KEY_SECRET': 'dlf.access-key-secret',
    }
    for env_key, cfg_key in env_overrides.items():
        value = os.getenv(env_key)
        if value:
            config[cfg_key] = value

    return config

# --- 核心复现逻辑 ---

def main():
    """
    Reproduce the IndexError by calling the pypaimon API directly.

    Points ``PAIMON_CONFIG_PATH`` at a fixed INI file, builds a ``RESTCatalog``
    from the options that are set, fetches the table named by the part of
    ``TABLE_TO_TEST`` before the first ``/`` and plans a scan over it.
    An ``IndexError`` is logged as a successful reproduction and any other
    exception as unexpected; the traceback is printed in both cases and
    nothing is re-raised.
    """
    # --- Settings, following paimon_dataset_test_v3.py ---
    os.environ['PAIMON_CONFIG_PATH'] = '/Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini'
    TABLE_TO_TEST = "adn.wide_table_200cols/dt=2025-09-01"
    # ----------------------------------------------

    logger.info(f"开始测试,目标表: {TABLE_TO_TEST}")
    logger.info(f"使用配置文件: {os.environ.get('PAIMON_CONFIG_PATH')}")
    logger.info("第一步: 加载 Paimon catalog 配置...")

    try:
        # 1. Load the configuration and keep only the options that have a value.
        cfg = _load_paimon_catalog_config()
        options = {}
        for key, value in cfg.items():
            if value is not None:
                options[key] = value

        context = CatalogContext.create_from_options(Options(options))
        catalog = RESTCatalog(context)
        logger.info("Catalog 创建成功. uri=%s", cfg.get('uri'))

        # 2. Resolve the table object; anything after the first '/' (the
        #    partition suffix) is not part of the table identifier.
        logger.info("第二步: 获取表对象...")
        table_name_full = TABLE_TO_TEST
        table_name, _, _ = table_name_full.partition('/')
        logger.info(f"解析表名: '{table_name_full}' -> '{table_name}'")
        paimon_table = catalog.get_table(table_name)
        logger.info(f"表 '{table_name}' 对象获取成功。")

        # 3. Create the scan and plan it. Planning is the step that fails,
        #    because it reads the manifest files.
        logger.info("第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)...")
        read_builder = paimon_table.new_read_builder()
        scan = read_builder.new_scan()

        # plan() is where the error is raised; reaching the line after it
        # means the problem was not reproduced.
        splits = scan.plan().splits()
        logger.info(f"操作成功完成,未发生错误。共找到 {len(splits)} 个 split。")

    except IndexError:
        logger.error("成功复现 'IndexError: index out of range'!")
        logger.error("这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。")
        traceback.print_exc()
    except Exception as e:
        logger.error(f"发生了预料之外的错误: {e}")
        traceback.print_exc()

# Run the reproduction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

What doesn't meet your expectations?

[2026-01-06 18:42:07,302] INFO:PaimonTest: 开始测试,目标表: adn.wide_table_200cols/dt=2025-09-01
[2026-01-06 18:42:07,302] INFO:PaimonTest: 使用配置文件: /Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini
[2026-01-06 18:42:07,302] INFO:PaimonTest: 第一步: 加载 Paimon catalog 配置...
[2026-01-06 18:42:07,384] INFO:PaimonTest: Catalog 创建成功. uri=https://cn-shanghai-vpc.dlf.aliyuncs.com
[2026-01-06 18:42:07,386] INFO:PaimonTest: 第二步: 获取表对象...
[2026-01-06 18:42:07,386] INFO:PaimonTest: 解析表名: 'adn.wide_table_200cols/dt=2025-09-01' -> 'adn.wide_table_200cols'
[2026-01-06 18:42:09,495] INFO:PaimonTest: 表 'adn.wide_table_200cols' 对象获取成功。
[2026-01-06 18:42:09,496] INFO:PaimonTest: 第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)...
[2026-01-06 18:42:09,934] ERROR:PaimonTest: 成功复现 'IndexError: index out of range'!
[2026-01-06 18:42:09,934] ERROR:PaimonTest: 这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。
Traceback (most recent call last):
  File "/Users/kl/PycharmProjects/tapio/reproduce_paimon_error.py", line 115, in main
    scan_plan = scan.plan()
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/table_scan.py", line 45, in plan
    return self.starting_scanner.scan()
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 77, in scan
    file_entries = self.plan_files()
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 95, in plan_files
    return self.read_manifest_entries(manifest_files)
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 104, in read_manifest_entries
    max_workers=max_workers)
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 57, in read_entries_parallel
    for entries in future_results:
  File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 586, in result_iterator
    yield fs.pop().result()
  File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 432, in result
    return self.__get_result()
  File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 384, in __get_result
    raise self._exception
  File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/thread.py", line 56, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 51, in _process_single_manifest
    return self.read(manifest_file.file_name, manifest_entry_filter, drop_stats)
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 84, in read
    min_values=BinaryRow(key_dict['_MIN_VALUES'], self.trimmed_primary_keys_fields),
  File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/table/row/binary_row.py", line 38, in __init__
    self.row_kind = RowKind(self.actual_data[0])
IndexError: index out of range

Anything else?

  • Python: 3.6.15
  • pyarrow: 6.0.1
  • pypaimon: 1.3.1

It's worth noting that this issue does not occur with the development version pypaimon==0.3.dev. I have tested the same code against the 0.3.dev version, and it runs
correctly without raising an IndexError.

Are you willing to submit a PR?

  • I'm willing to submit a PR!

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions