Skip to content

Commit fd0c317

Browse files
authored
fix: parquet statistics (#2248)
Don't save the string length into min/max statistics
1 parent fea1c66 commit fd0c317

3 files changed

Lines changed: 70 additions & 7 deletions

File tree

src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroup/StatisticsReader.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public function max(FlatColumn $column) : mixed
2727
return null;
2828
}
2929

30-
if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->max, 'UTF-8')) {
30+
if (ColumnPrimitiveType::isString($column)) {
3131
return $this->statistics->max;
3232
}
3333

@@ -40,7 +40,7 @@ public function maxValue(FlatColumn $column) : mixed
4040
return null;
4141
}
4242

43-
if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->maxValue, 'UTF-8')) {
43+
if (ColumnPrimitiveType::isString($column)) {
4444
return $this->statistics->maxValue;
4545
}
4646

@@ -53,7 +53,7 @@ public function min(FlatColumn $column) : mixed
5353
return null;
5454
}
5555

56-
if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->min, 'UTF-8')) {
56+
if (ColumnPrimitiveType::isString($column)) {
5757
return $this->statistics->min;
5858
}
5959

@@ -66,7 +66,7 @@ public function minValue(FlatColumn $column) : mixed
6666
return null;
6767
}
6868

69-
if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->minValue, 'UTF-8')) {
69+
if (ColumnPrimitiveType::isString($column)) {
7070
return $this->statistics->minValue;
7171
}
7272

src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
use Flow\Parquet\Data\PlainValuesPacker;
1010
use Flow\Parquet\Dremel\Statistics\Comparator;
1111
use Flow\Parquet\Exception\InvalidArgumentException;
12-
use Flow\Parquet\ParquetFile\Schema\FlatColumn;
12+
use Flow\Parquet\ParquetFile\Schema\{FlatColumn, PhysicalType};
1313
use Flow\Parquet\ParquetFile\Statistics;
1414

1515
final class StatisticsCounter
@@ -126,8 +126,24 @@ public function toStatistics() : Statistics
126126
$minBuffer = '';
127127
$maxBuffer = '';
128128

129-
(new PlainValuesPacker(new BinaryBufferWriter($minBuffer)))->packValues($this->column, [$this->min()]);
130-
(new PlainValuesPacker(new BinaryBufferWriter($maxBuffer)))->packValues($this->column, [$this->max()]);
129+
$min = $this->min();
130+
$max = $this->max();
131+
132+
if ($min !== null) {
133+
if ($this->column->type() === PhysicalType::BYTE_ARRAY && \is_string($min)) {
134+
(new BinaryBufferWriter($minBuffer))->append($min);
135+
} else {
136+
(new PlainValuesPacker(new BinaryBufferWriter($minBuffer)))->packValues($this->column, [$min]);
137+
}
138+
}
139+
140+
if ($max !== null) {
141+
if ($this->column->type() === PhysicalType::BYTE_ARRAY && \is_string($max)) {
142+
(new BinaryBufferWriter($maxBuffer))->append($max);
143+
} else {
144+
(new PlainValuesPacker(new BinaryBufferWriter($maxBuffer)))->packValues($this->column, [$max]);
145+
}
146+
}
131147

132148
return new Statistics(
133149
max: $maxBuffer !== '' ? $maxBuffer : null,

src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,53 @@ public function test_reset() : void
463463
self::assertNull($statistics->max());
464464
}
465465

466+
public function test_to_statistics_encodes_byte_array_without_length_prefix() : void
467+
{
468+
$column = FlatColumn::string('test_column');
469+
$statistics = new StatisticsCounter($column);
470+
471+
$statistics->add('hello');
472+
$statistics->add('world');
473+
474+
$result = $statistics->toStatistics();
475+
476+
self::assertSame('hello', $result->min);
477+
self::assertSame('world', $result->max);
478+
self::assertSame('hello', $result->minValue);
479+
self::assertSame('world', $result->maxValue);
480+
}
481+
482+
public function test_to_statistics_with_int32_encodes_with_packer() : void
483+
{
484+
$column = FlatColumn::int32('test_column');
485+
$statistics = new StatisticsCounter($column);
486+
487+
$statistics->add(5);
488+
$statistics->add(10);
489+
490+
$result = $statistics->toStatistics();
491+
492+
self::assertSame(\pack('l', 5), $result->min);
493+
self::assertSame(\pack('l', 10), $result->max);
494+
}
495+
496+
public function test_to_statistics_with_null_values_does_not_encode() : void
497+
{
498+
$column = FlatColumn::string('test_column');
499+
$statistics = new StatisticsCounter($column);
500+
501+
$statistics->add(null);
502+
$statistics->add(null);
503+
504+
$result = $statistics->toStatistics();
505+
506+
self::assertNull($result->min);
507+
self::assertNull($result->max);
508+
self::assertNull($result->minValue);
509+
self::assertNull($result->maxValue);
510+
self::assertSame(2, $result->nullCount);
511+
}
512+
466513
public function test_values_count_calculation_with_arrays() : void
467514
{
468515
$column = FlatColumn::string('test_column');

0 commit comments

Comments
 (0)