-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Open
Labels
bug: Something isn't working
Description
Search before asking
- I searched in the issues and found nothing similar.
Paimon version
1.3
Compute Engine
spark
Minimal reproduce step
# reproduce_paimon_error.py
import configparser
import os
import traceback
import logging
from pypaimon.catalog.catalog_context import CatalogContext
from pypaimon.catalog.rest.rest_catalog import RESTCatalog
from pypaimon.api.options import Options
# --- Reuse the config-loading and logging logic from paimon_dataset_v3.py ---
# 1. Set up a simple logger.
logger = logging.getLogger("PaimonTest")
if not logger.handlers:
    # Attach the stream handler only once so that re-importing this module
    # does not produce duplicated log lines.
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s: %(message)s")
    )
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
# 2. Reuse the config-loading function so connection settings stay consistent.
def _load_paimon_catalog_config() -> dict:
    """Load the Paimon catalog config from a config file and the environment.

    Values are first read from the ``DEFAULT`` section of the INI file named
    by ``PAIMON_CONFIG_PATH`` (falling back to ``$HOME/.paimon_config.ini``),
    then overridden by any non-empty environment variable listed in
    ``env_mappings``; environment variables take priority.

    Returns:
        dict: catalog option name -> value. Options that were supplied by
        neither source keep their initial value (``None``, or ``'rest'`` for
        ``metastore``).
    """
    config = {
        'metastore': 'rest',  # rest, hive, filesystem
        'uri': None,
        'warehouse': None,
        'dlf.region': None,
        'token.provider': None,
        'dlf.access-key-id': None,
        'dlf.access-key-secret': None,
    }

    # Read from the config file.
    default_path = os.getenv("HOME", "/home/admin") + "/.paimon_config.ini"
    config_path = os.getenv('PAIMON_CONFIG_PATH', default_path)
    if os.path.exists(config_path):
        try:
            parser = configparser.ConfigParser()
            parser.read(config_path)
            # File option names are identical to the config keys.
            for key in list(config):
                if not parser.has_option('DEFAULT', key):
                    continue
                try:
                    config[key] = parser.get('DEFAULT', key)
                except configparser.InterpolationError:
                    # A literal '%' (common in secrets and percent-encoded
                    # URLs) is not valid interpolation syntax; use the raw
                    # text instead of dropping this and all later options.
                    config[key] = parser.get('DEFAULT', key, raw=True)
        except (configparser.Error, OSError, UnicodeError) as e:
            logger.warning("Failed to load Paimon config from %s: %s", config_path, e)
    else:
        logger.info("Paimon config file not found: %s", config_path)

    # Environment variables override the file; empty values are ignored.
    env_mappings = {
        'PAIMON_METASTORE': 'metastore',
        'PAIMON_URI': 'uri',
        'PAIMON_WAREHOUSE': 'warehouse',
        'DLF_REGION': 'dlf.region',
        'PAIMON_TOKEN_PROVIDER': 'token.provider',
        'DLF_ACCESS_KEY_ID': 'dlf.access-key-id',
        'DLF_ACCESS_KEY_SECRET': 'dlf.access-key-secret',
    }
    for env_key, cfg_key in env_mappings.items():
        value = os.getenv(env_key)
        if value:
            config[cfg_key] = value
    return config
# --- Core reproduction logic ---
def main():
    """Reproduce the IndexError by calling the pypaimon API directly.

    Builds a REST catalog from the loaded configuration, fetches the target
    table and plans a scan. Any exception is logged and its traceback is
    printed; nothing is re-raised.
    """
    # --- Configuration, following paimon_dataset_test_v3.py ---
    # setdefault keeps a PAIMON_CONFIG_PATH already provided by the caller,
    # so the environment can still select a different config file.
    os.environ.setdefault(
        'PAIMON_CONFIG_PATH',
        '/Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini',
    )
    TABLE_TO_TEST = "adn.wide_table_200cols/dt=2025-09-01"
    # ----------------------------------------------
    logger.info("开始测试,目标表: %s", TABLE_TO_TEST)
    logger.info("使用配置文件: %s", os.environ.get('PAIMON_CONFIG_PATH'))
    logger.info("第一步: 加载 Paimon catalog 配置...")
    try:
        # 1. Load the configuration and create the catalog; unset options
        #    are dropped before being handed to pypaimon.
        cfg = _load_paimon_catalog_config()
        options = {k: v for k, v in cfg.items() if v is not None}
        catalog = RESTCatalog(CatalogContext.create_from_options(Options(options)))
        logger.info("Catalog 创建成功. uri=%s", cfg.get('uri'))

        # 2. Get the table object. Everything from the first '/' onward
        #    (the partition suffix) is stripped from the identifier.
        logger.info("第二步: 获取表对象...")
        table_name_full = TABLE_TO_TEST
        table_name = table_name_full.split('/', 1)[0]
        logger.info("解析表名: '%s' -> '%s'", table_name_full, table_name)
        paimon_table = catalog.get_table(table_name)
        logger.info("表 '%s' 对象获取成功。", table_name)

        # 3. Create the scan and plan it.
        # This is the step that triggers the error, because it reads the
        # manifest files.
        logger.info("第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)...")
        scan = paimon_table.new_read_builder().new_scan()
        # scan.plan() is the trigger point of the whole error.
        scan_plan = scan.plan()

        # Reaching this point means the error was not reproduced.
        splits = scan_plan.splits()
        logger.info("操作成功完成,未发生错误。共找到 %s 个 split。", len(splits))
    except IndexError:
        logger.error("成功复现 'IndexError: index out of range'!")
        logger.error("这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。")
        traceback.print_exc()
    except Exception as e:
        logger.error("发生了预料之外的错误: %s", e)
        traceback.print_exc()
if __name__ == "__main__":
    main()

What doesn't meet your expectations?
[2026-01-06 18:42:07,302] INFO:PaimonTest: 开始测试,目标表: adn.wide_table_200cols/dt=2025-09-01
[2026-01-06 18:42:07,302] INFO:PaimonTest: 使用配置文件: /Users/kl/github/paimon/paimon-python/pypaimon/tests/paimon_config.ini
[2026-01-06 18:42:07,302] INFO:PaimonTest: 第一步: 加载 Paimon catalog 配置...
[2026-01-06 18:42:07,384] INFO:PaimonTest: Catalog 创建成功. uri=https://cn-shanghai-vpc.dlf.aliyuncs.com
[2026-01-06 18:42:07,386] INFO:PaimonTest: 第二步: 获取表对象...
[2026-01-06 18:42:07,386] INFO:PaimonTest: 解析表名: 'adn.wide_table_200cols/dt=2025-09-01' -> 'adn.wide_table_200cols'
[2026-01-06 18:42:09,495] INFO:PaimonTest: 表 'adn.wide_table_200cols' 对象获取成功。
[2026-01-06 18:42:09,496] INFO:PaimonTest: 第三步: 创建扫描并执行 plan() 操作(这将触发 Manifest 读取)...
[2026-01-06 18:42:09,934] ERROR:PaimonTest: 成功复现 'IndexError: index out of range'!
[2026-01-06 18:42:09,934] ERROR:PaimonTest: 这确认了问题发生在 pypaimon 读取 Paimon 表元数据(Manifest)的底层过程中。
Traceback (most recent call last):
File "/Users/kl/PycharmProjects/tapio/reproduce_paimon_error.py", line 115, in main
scan_plan = scan.plan()
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/table_scan.py", line 45, in plan
return self.starting_scanner.scan()
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 77, in scan
file_entries = self.plan_files()
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 95, in plan_files
return self.read_manifest_entries(manifest_files)
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/read/scanner/full_starting_scanner.py", line 104, in read_manifest_entries
max_workers=max_workers)
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 57, in read_entries_parallel
for entries in future_results:
File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 586, in result_iterator
yield fs.pop().result()
File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 432, in result
return self.__get_result()
File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
File "/Users/kl/.pyenv/versions/3.6.15/lib/python3.6/concurrent/futures/thread.py", line 56, in run
result = self.fn(*self.args, **self.kwargs)
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 51, in _process_single_manifest
return self.read(manifest_file.file_name, manifest_entry_filter, drop_stats)
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/manifest/manifest_file_manager.py", line 84, in read
min_values=BinaryRow(key_dict['_MIN_VALUES'], self.trimmed_primary_keys_fields),
File "/Users/kl/PycharmProjects/tapio/.venv/lib/python3.6/site-packages/pypaimon/table/row/binary_row.py", line 38, in __init__
self.row_kind = RowKind(self.actual_data[0])
IndexError: index out of range

Anything else?
- Python: 3.6.15
- pyarrow: 6.0.1
- pypaimon: 1.3.1
It's worth noting that this issue does not occur with the development version pypaimon==0.3.dev. I have tested the same code against the 0.3.dev version, and it runs
correctly without raising an IndexError.
Are you willing to submit a PR?
- I'm willing to submit a PR!
Metadata
Metadata
Assignees
Labels
bug: Something isn't working