Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions paimon-python/pypaimon/table/row/binary_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,19 @@ def __init__(self, data: bytes, fields: List[DataField]):
Initialize BinaryRow with raw binary data and field definitions.
"""
self.data = data
self.arity = int.from_bytes(data[:4], 'big')
# Skip the arity prefix (4 bytes) if present
self.actual_data = data[4:] if len(data) >= 4 else data
if len(data) < 4:
self.arity = 0
self.actual_data = b''
else:
self.arity = int.from_bytes(data[:4], 'big')
self.actual_data = data[4:] if len(data) >= 4 else data

self.fields = fields
self.row_kind = RowKind(self.actual_data[0])

if len(self.actual_data) == 0:
self.row_kind = RowKind.INSERT
else:
self.row_kind = RowKind(self.actual_data[0])

def get_field(self, index: int) -> Any:
from pypaimon.table.row.generic_row import GenericRowDeserializer
Expand Down
21 changes: 21 additions & 0 deletions paimon-python/pypaimon/tests/binary_row_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
from pypaimon.manifest.schema.manifest_entry import ManifestEntry
from pypaimon.manifest.schema.simple_stats import SimpleStats
from pypaimon.read.scanner.full_starting_scanner import FullStartingScanner
from pypaimon.table.row.binary_row import BinaryRow
from pypaimon.table.row.generic_row import GenericRow, GenericRowDeserializer
from pypaimon.table.row.row_kind import RowKind


def _random_format():
Expand Down Expand Up @@ -332,3 +334,22 @@ def _overwrite_manifest_entry(self, table):
[0],
)
starting_scanner.manifest_file_manager.write(manifest_files[0].file_name, manifest_entries)

def test_binary_row_with_empty_actual_data(self):
    """Check BinaryRow construction from payloads that carry no field data.

    Three payloads are exercised: completely empty bytes, a bare 4-byte
    zero arity prefix, and a zero arity prefix followed by 8 zero bytes.
    Each must report arity 0 and a row kind of ``RowKind.INSERT``.
    """
    arity_prefix = (0).to_bytes(4, 'big')  # b'\x00\x00\x00\x00'

    # Payload with no bytes at all.
    row_from_nothing = BinaryRow(b'', [])
    self.assertEqual(row_from_nothing.arity, 0)
    self.assertEqual(row_from_nothing.actual_data, b'')
    self.assertEqual(row_from_nothing.get_row_kind(), RowKind.INSERT)

    # Payload holding only the arity prefix, nothing after it.
    row_from_prefix = BinaryRow(arity_prefix, [])
    self.assertEqual(row_from_prefix.arity, 0)
    self.assertEqual(row_from_prefix.actual_data, b'')
    self.assertEqual(row_from_prefix.get_row_kind(), RowKind.INSERT)

    # Arity prefix plus 8 zero bytes: 12 bytes total.
    row_from_padded = BinaryRow(arity_prefix + b'\x00' * 8, [])
    self.assertEqual(row_from_padded.arity, 0)
    self.assertEqual(len(row_from_padded.actual_data), 8)
    self.assertEqual(row_from_padded.get_row_kind(), RowKind.INSERT)
37 changes: 37 additions & 0 deletions paimon-python/pypaimon/tests/reader_base_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,23 @@ def test_full_data_types(self):
[]).values
self.assertEqual(min_value_stats, [])
self.assertEqual(max_value_stats, [])

self.assertGreater(
len(manifest_entries[0].file.value_stats.min_values.data), 0,
"MIN_VALUES.data should have bytes written, not empty (0 bytes)")
self.assertGreaterEqual(
len(manifest_entries[0].file.value_stats.min_values.data), 12,
f"MIN_VALUES.data should be at least 12 bytes "
f"(4 bytes arity + 8 bytes fixed part for empty GenericRow), "
f"but got {len(manifest_entries[0].file.value_stats.min_values.data)} bytes")
self.assertGreater(
len(manifest_entries[0].file.value_stats.max_values.data), 0,
"MAX_VALUES.data should have bytes written, not empty (0 bytes)")
self.assertGreaterEqual(
len(manifest_entries[0].file.value_stats.max_values.data), 12,
f"MAX_VALUES.data should be at least 12 bytes "
f"(4 bytes arity + 8 bytes fixed part for empty GenericRow), "
f"but got {len(manifest_entries[0].file.value_stats.max_values.data)} bytes")

def test_write_wrong_schema(self):
self.catalog.create_table('default.test_wrong_schema',
Expand Down Expand Up @@ -623,6 +640,26 @@ def test_value_stats_empty_when_stats_disabled(self):
len(file_meta.value_stats.null_counts), len(empty_stats.null_counts),
"value_stats.null_counts should be empty (same as SimpleStats.empty_stats()) when stats are disabled"
)

# Verify that Python actually writes bytes, rather than an empty MIN_VALUES.
# Even when min_values is empty (GenericRow([], [])), Python should serialize it to 12 bytes
# (4-byte arity + 8-byte fixed part), not 0 bytes or 4 bytes.
self.assertGreater(
len(file_meta.value_stats.min_values.data), 0,
"MIN_VALUES.data should have bytes written, not empty (0 bytes)")
self.assertGreaterEqual(
len(file_meta.value_stats.min_values.data), 12,
f"MIN_VALUES.data should be at least 12 bytes "
f"(4 bytes arity + 8 bytes fixed part for empty GenericRow), "
f"but got {len(file_meta.value_stats.min_values.data)} bytes")
self.assertGreater(
len(file_meta.value_stats.max_values.data), 0,
"MAX_VALUES.data should have bytes written, not empty (0 bytes)")
self.assertGreaterEqual(
len(file_meta.value_stats.max_values.data), 12,
f"MAX_VALUES.data should be at least 12 bytes "
f"(4 bytes arity + 8 bytes fixed part for empty GenericRow), "
f"but got {len(file_meta.value_stats.max_values.data)} bytes")

def test_types(self):
data_fields = [
Expand Down