From 225266ccccd65c20b899124c814225ec3990b5a3 Mon Sep 17 00:00:00 2001
From: Guotao Yu
Date: Fri, 26 Dec 2025 23:34:37 +0800
Subject: [PATCH 1/3] feat: implement ValidateRedundantPartitions for PartitionSpec

- Add ValidateRedundantPartitions method to detect redundant partition fields
- Use deduplication key (source_id + transform.DedupName()) to identify conflicts
- Integrate validation into PartitionSpec::Validate method
---
 src/iceberg/partition_spec.cc                 |  23 ++
 src/iceberg/partition_spec.h                  |   3 +
 src/iceberg/test/partition_spec_test.cc       | 123 ++++++++
 .../test/table_metadata_builder_test.cc       | 268 +++++++++++++++++-
 src/iceberg/transform.cc                      |   2 +
 src/iceberg/transform.h                       |   4 +
 6 files changed, 418 insertions(+), 5 deletions(-)

diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc
index af77d1fe0..97a318faa 100644
--- a/src/iceberg/partition_spec.cc
+++ b/src/iceberg/partition_spec.cc
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "iceberg/result.h"
 #include "iceberg/schema.h"
@@ -132,6 +133,7 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const {

 Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) const {
   ICEBERG_RETURN_UNEXPECTED(ValidatePartitionName(schema, *this));
+  ICEBERG_RETURN_UNEXPECTED(ValidateRedundantPartitions(*this));

   std::unordered_map parents = IndexParents(schema);
   for (const auto& partition_field : fields_) {
@@ -261,4 +263,25 @@ bool PartitionSpec::HasSequentialFieldIds(const PartitionSpec& spec) {
   return true;
 }

+Status PartitionSpec::ValidateRedundantPartitions(const PartitionSpec& spec) {
+  // Use a map to track deduplication keys (source_id + transform dedup name)
+  std::unordered_map<std::pair<int32_t, std::string>, const PartitionField*> dedup_fields;
+
+  for (const auto& field : spec.fields()) {
+    // Create dedup key: (source_id, transform_dedup_name)
+    auto dedup_key = std::make_pair(field.source_id(), field.transform()->DedupName());
+
+    // Check if this dedup key already exists
+    auto existing_field_iter = dedup_fields.find(dedup_key);
+    ICEBERG_PRECHECK(existing_field_iter == dedup_fields.end(),
+                     "Cannot add redundant partition: {} conflicts with {}",
+                     field.ToString(), existing_field_iter->second->ToString());
+
+    // Add this field to the dedup map
+    dedup_fields[dedup_key] = &field;
+  }
+
+  return {};
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h
index 9aa1954ab..ae10dfccf 100644
--- a/src/iceberg/partition_spec.h
+++ b/src/iceberg/partition_spec.h
@@ -135,6 +135,9 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable {
   /// \brief Compare two partition specs for equality.
   bool Equals(const PartitionSpec& other) const;

+  /// \brief Validates that there are no redundant partition fields in the spec.
+ static Status ValidateRedundantPartitions(const PartitionSpec& spec); + using SourceIdToFieldsMap = std::unordered_map>; static Result InitSourceIdToFieldsMap(const PartitionSpec&); diff --git a/src/iceberg/test/partition_spec_test.cc b/src/iceberg/test/partition_spec_test.cc index f9953a308..10cb98e4d 100644 --- a/src/iceberg/test/partition_spec_test.cc +++ b/src/iceberg/test/partition_spec_test.cc @@ -303,4 +303,127 @@ TEST(PartitionSpecTest, PartitionFieldInStructInMap) { EXPECT_THAT(result_value, HasErrorMessage("Invalid partition field parent")); } +// Test ValidateRedundantPartitions +TEST(PartitionSpecTest, ValidateRedundantPartitionsExactDuplicates) { + // Create a schema with different field types + auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp()); + auto id_field = SchemaField::MakeRequired(2, "id", int64()); + Schema schema({ts_field, id_field}, Schema::kInitialSchemaId); + + // Test: exact duplicate transforms on same field (same dedup name) + { + PartitionField field1(1, 1000, "ts_day1", Transform::Day()); + PartitionField field2(1, 1001, "ts_day2", Transform::Day()); + + auto result = PartitionSpec::Make(schema, 1, {field1, field2}, false); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition")); + EXPECT_THAT(result, HasErrorMessage("conflicts with")); + } + + // Test: same bucket size on same field (redundant) + { + PartitionField bucket1(2, 1000, "id_bucket1", Transform::Bucket(16)); + PartitionField bucket2(2, 1001, "id_bucket2", Transform::Bucket(16)); + + auto result = PartitionSpec::Make(schema, 1, {bucket1, bucket2}, false); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition")); + } + + // Test: same truncate width on same field (redundant) + { + auto name_field = SchemaField::MakeRequired(3, "name", string()); + Schema schema_with_string({name_field}, Schema::kInitialSchemaId); + + PartitionField truncate1(3, 1000, "name_trunc1", Transform::Truncate(4)); + PartitionField truncate2(3, 1001, "name_trunc2", Transform::Truncate(4)); + + auto result = + PartitionSpec::Make(schema_with_string, 1, {truncate1, truncate2}, false); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition")); + } +} + +TEST(PartitionSpecTest, ValidateRedundantPartitionsAllowedCases) { + // Create a schema with different field types + auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp()); + auto id_field = SchemaField::MakeRequired(2, "id", int64()); + auto name_field = SchemaField::MakeRequired(3, "name", string()); + Schema schema({ts_field, id_field, name_field}, Schema::kInitialSchemaId); + + // Test: different bucket sizes on same field (allowed - different dedup names) + { + PartitionField bucket16(2, 1000, "id_bucket16", Transform::Bucket(16)); + PartitionField bucket32(2, 1001, "id_bucket32", Transform::Bucket(32)); + + auto result = PartitionSpec::Make(schema, 1, {bucket16, bucket32}, false); + EXPECT_THAT(result, IsOk()); + } + + // Test: different truncate widths on same field (allowed - different dedup names) + { + PartitionField truncate4(3, 1000, "name_trunc4", Transform::Truncate(4)); + PartitionField truncate8(3, 1001, "name_trunc8", Transform::Truncate(8)); + + auto result = PartitionSpec::Make(schema, 1, {truncate4, truncate8}, false); + EXPECT_THAT(result, IsOk()); + } + + // Test: same transforms on different fields 
(allowed) + { + PartitionField ts_day(1, 1000, "ts_day", Transform::Day()); + PartitionField id_bucket(2, 1001, "id_bucket", Transform::Bucket(16)); + + auto result = PartitionSpec::Make(schema, 1, {ts_day, id_bucket}, false); + EXPECT_THAT(result, IsOk()); + } + + // Test: different transforms on same field (allowed if dedup names differ) + { + PartitionField ts_day(1, 1000, "ts_day", Transform::Day()); + PartitionField ts_month(1, 1001, "ts_month", Transform::Month()); + + // This should be allowed since Day and Month have different dedup names + // The Java logic only checks for exact dedup name matches + auto result = PartitionSpec::Make(schema, 1, {ts_day, ts_month}, false); + EXPECT_THAT(result, IsOk()); + } + + // Test: single partition field (no redundancy possible) + { + PartitionField single_field(1, 1000, "ts_year", Transform::Year()); + + auto result = PartitionSpec::Make(schema, 1, {single_field}, false); + EXPECT_THAT(result, IsOk()); + } +} + +TEST(PartitionSpecTest, ValidateRedundantPartitionsIdentityTransforms) { + // Create a schema with different field types + auto id_field = SchemaField::MakeRequired(1, "id", int64()); + auto name_field = SchemaField::MakeRequired(2, "name", string()); + Schema schema({id_field, name_field}, Schema::kInitialSchemaId); + + // Test: multiple identity transforms on same field (redundant) + { + PartitionField identity1(1, 1000, "id1", Transform::Identity()); + PartitionField identity2(1, 1001, "id2", Transform::Identity()); + + auto result = PartitionSpec::Make(schema, 1, {identity1, identity2}, false); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition")); + } + + // Test: identity transforms on different fields (allowed) + { + PartitionField id_identity(1, 1000, "id", Transform::Identity()); + PartitionField name_identity(2, 1001, "name", Transform::Identity()); + + auto result = PartitionSpec::Make(schema, 1, {id_identity, name_identity}, false); + EXPECT_THAT(result, IsOk()); + } +} + } // namespace iceberg diff --git a/src/iceberg/test/table_metadata_builder_test.cc b/src/iceberg/test/table_metadata_builder_test.cc index a912a08f5..4c551fea4 100644 --- a/src/iceberg/test/table_metadata_builder_test.cc +++ b/src/iceberg/test/table_metadata_builder_test.cc @@ -442,17 +442,18 @@ TEST(TableMetadataBuilderTest, AddSchemaBasic) { EXPECT_EQ(metadata->last_column_id, 6); } -TEST(TableMetadataBuilderTest, AddSchemaInvalidColumnIds) { +TEST(TableMetadataBuilderTest, AddSchemaWithDuplicateColumnIds) { auto base = CreateBaseMetadata(); auto builder = TableMetadataBuilder::BuildFrom(base.get()); - // Try to add schema with column ID lower than existing last_column_id + // Add schema with column ID that already exists - should still work auto field1 = SchemaField::MakeRequired(2, "duplicate_id", int64()); // ID 2 already exists - auto invalid_schema = std::make_shared(std::vector{field1}, 1); - builder->AddSchema(invalid_schema); + auto schema_with_duplicate = + std::make_shared(std::vector{field1}, 1); + builder->AddSchema(schema_with_duplicate); ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); - // Should still work - AddSchema automatically uses max(existing, highest_in_schema) + // Should work - AddSchema automatically uses max(existing, highest_in_schema) ASSERT_EQ(metadata->schemas.size(), 2); EXPECT_EQ(metadata->last_column_id, 3); // Should remain 3 (from base metadata) } @@ -483,6 +484,50 @@ TEST(TableMetadataBuilderTest, AddSchemaEmptyFields) { 
EXPECT_EQ(metadata->last_column_id, 3); // Should remain unchanged } +TEST(TableMetadataBuilderTest, AddSchemaIdempotent) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add the same schema twice + auto field1 = SchemaField::MakeRequired(4, "field1", int64()); + auto field2 = SchemaField::MakeRequired(5, "field2", string()); + auto schema1 = std::make_shared(std::vector{field1, field2}, 1); + auto schema2 = std::make_shared(std::vector{field1, field2}, + 2); // Different ID but same fields + + builder->AddSchema(schema1); + builder->AddSchema(schema2); // Should reuse existing schema + + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + ASSERT_EQ(metadata->schemas.size(), 2); // Only one new schema should be added + EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1); +} + +TEST(TableMetadataBuilderTest, AddSchemaMultipleDifferent) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add multiple different schemas + auto field1 = SchemaField::MakeRequired(4, "field1", int64()); + auto field2 = SchemaField::MakeRequired(5, "field2", string()); + auto field3 = SchemaField::MakeRequired(6, "field3", float64()); + + auto schema1 = std::make_shared(std::vector{field1}, 1); + auto schema2 = std::make_shared(std::vector{field2}, 2); + auto schema3 = std::make_shared(std::vector{field3}, 3); + + builder->AddSchema(schema1); + builder->AddSchema(schema2); + builder->AddSchema(schema3); + + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + ASSERT_EQ(metadata->schemas.size(), 4); // Original + 3 new + EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1); + EXPECT_EQ(metadata->schemas[2]->schema_id().value(), 2); + EXPECT_EQ(metadata->schemas[3]->schema_id().value(), 3); + EXPECT_EQ(metadata->last_column_id, 6); +} + // Test SetCurrentSchema TEST(TableMetadataBuilderTest, SetCurrentSchemaBasic) { auto base = CreateBaseMetadata(); @@ -548,6 +593,219 @@ TEST(TableMetadataBuilderTest, SetCurrentSchemaUpdatesLastColumnId) { EXPECT_EQ(metadata->last_column_id, 10); } +// Additional comprehensive tests for AddSchema +TEST(TableMetadataBuilderTest, AddSchemaWithNestedTypes) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add schema with nested struct type + auto inner_field1 = SchemaField::MakeRequired(10, "inner_id", int64()); + auto inner_field2 = SchemaField::MakeRequired(11, "inner_name", string()); + auto struct_type = + std::make_shared(std::vector{inner_field1, inner_field2}); + auto struct_field = SchemaField::MakeRequired(4, "nested_struct", struct_type); + + auto new_schema = std::make_shared(std::vector{struct_field}, 1); + builder->AddSchema(new_schema); + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + + ASSERT_EQ(metadata->schemas.size(), 2); + EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1); + EXPECT_EQ(metadata->last_column_id, 11); // Should be highest nested field ID +} + +TEST(TableMetadataBuilderTest, AddSchemaWithListAndMapTypes) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add schema with list type + auto list_type = std::make_shared(10, int32(), true); + auto list_field = SchemaField::MakeRequired(4, "int_list", list_type); + + // Add schema with map type + auto key_field = SchemaField::MakeRequired(11, "key", string()); + auto value_field = SchemaField::MakeRequired(12, "value", int64()); + auto map_type = 
std::make_shared(key_field, value_field); + auto map_field = SchemaField::MakeRequired(5, "string_int_map", map_type); + + auto new_schema = + std::make_shared(std::vector{list_field, map_field}, 1); + builder->AddSchema(new_schema); + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + + ASSERT_EQ(metadata->schemas.size(), 2); + EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1); + EXPECT_EQ(metadata->last_column_id, 12); // Should be highest field ID including nested +} + +TEST(TableMetadataBuilderTest, AddSchemaSequentialIds) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add multiple schemas and verify ID assignment + auto field1 = SchemaField::MakeRequired(4, "field1", int64()); + auto schema1 = std::make_shared(std::vector{field1}, 1); + + auto field2 = SchemaField::MakeRequired(5, "field2", string()); + auto schema2 = std::make_shared(std::vector{field2}, 2); + + auto field3 = SchemaField::MakeRequired(6, "field3", float64()); + auto schema3 = std::make_shared(std::vector{field3}, 3); + + builder->AddSchema(schema1); + builder->AddSchema(schema2); + builder->AddSchema(schema3); + + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + ASSERT_EQ(metadata->schemas.size(), 4); // Original + 3 new + + // Verify sequential schema IDs + EXPECT_EQ(metadata->schemas[0]->schema_id().value(), 0); // Original + EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1); + EXPECT_EQ(metadata->schemas[2]->schema_id().value(), 2); + EXPECT_EQ(metadata->schemas[3]->schema_id().value(), 3); +} + +TEST(TableMetadataBuilderTest, AddSchemaWithOptionalFields) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add schema with mix of required and optional fields + auto required_field = SchemaField::MakeRequired(4, "required_field", int64()); + auto optional_field = SchemaField::MakeOptional(5, "optional_field", string()); + + auto new_schema = std::make_shared( + std::vector{required_field, optional_field}, 1); + builder->AddSchema(new_schema); + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + + ASSERT_EQ(metadata->schemas.size(), 2); + EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1); + + // Verify field properties + const auto& fields = metadata->schemas[1]->fields(); + ASSERT_EQ(fields.size(), 2); + EXPECT_FALSE(fields[0].optional()); // required_field + EXPECT_TRUE(fields[1].optional()); // optional_field +} + +// Additional comprehensive tests for SetCurrentSchema +TEST(TableMetadataBuilderTest, SetCurrentSchemaInvalidId) { + auto base = CreateBaseMetadata(); + + // 1. Try to use -1 (last added) when no schema has been added + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + builder->SetCurrentSchema(-1); + ASSERT_THAT(builder->Build(), IsError(ErrorKind::kValidationFailed)); + ASSERT_THAT(builder->Build(), HasErrorMessage("no schema has been added")); + + // 2. 
Try to set non-existent schema ID + builder = TableMetadataBuilder::BuildFrom(base.get()); + builder->SetCurrentSchema(999); + ASSERT_THAT(builder->Build(), IsError(ErrorKind::kValidationFailed)); + ASSERT_THAT(builder->Build(), HasErrorMessage("unknown schema: 999")); +} + +TEST(TableMetadataBuilderTest, SetCurrentSchemaWithPartitionSpecRebuild) { + auto base = CreateBaseMetadata(); + + // Add a partition spec to the base metadata + auto schema = CreateTestSchema(); + ICEBERG_UNWRAP_OR_FAIL( + auto spec, + PartitionSpec::Make(PartitionSpec::kInitialSpecId, + {PartitionField(1, 1000, "id_bucket", Transform::Bucket(16))}, + 1000)); + base->partition_specs.push_back(std::move(spec)); + + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add and set a new schema - should rebuild partition specs + auto field1 = SchemaField::MakeRequired(4, "new_id", int64()); + auto field2 = SchemaField::MakeRequired(5, "new_data", string()); + auto new_schema = std::make_shared(std::vector{field1, field2}, 1); + builder->SetCurrentSchema(new_schema, 5); + + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + + // Verify schema was set + EXPECT_EQ(metadata->current_schema_id.value(), 1); + EXPECT_EQ(metadata->last_column_id, 5); + + // Verify partition specs were rebuilt (they should still exist) + ASSERT_EQ(metadata->partition_specs.size(), 2); +} + +TEST(TableMetadataBuilderTest, SetCurrentSchemaMultipleOperations) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add multiple schemas + auto field1 = SchemaField::MakeRequired(4, "field1", int64()); + auto schema1 = std::make_shared(std::vector{field1}, 1); + + auto field2 = SchemaField::MakeRequired(5, "field2", string()); + auto schema2 = std::make_shared(std::vector{field2}, 2); + + builder->AddSchema(schema1); + builder->AddSchema(schema2); + + // Set current to first added schema + builder->SetCurrentSchema(1); + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + EXPECT_EQ(metadata->current_schema_id.value(), 1); + + // Change current to second schema + builder = TableMetadataBuilder::BuildFrom(metadata.get()); + builder->SetCurrentSchema(2); + ICEBERG_UNWRAP_OR_FAIL(metadata, builder->Build()); + EXPECT_EQ(metadata->current_schema_id.value(), 2); + + // Change back to original schema + builder = TableMetadataBuilder::BuildFrom(metadata.get()); + builder->SetCurrentSchema(0); + ICEBERG_UNWRAP_OR_FAIL(metadata, builder->Build()); + EXPECT_EQ(metadata->current_schema_id.value(), 0); +} + +TEST(TableMetadataBuilderTest, SetCurrentSchemaLastAddedTracking) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add multiple schemas + auto field1 = SchemaField::MakeRequired(4, "field1", int64()); + auto schema1 = std::make_shared(std::vector{field1}, 1); + + auto field2 = SchemaField::MakeRequired(5, "field2", string()); + auto schema2 = std::make_shared(std::vector{field2}, 2); + + builder->AddSchema(schema1); + builder->AddSchema(schema2); // This becomes the "last added" + + // Use -1 to set current to last added schema + builder->SetCurrentSchema(-1); + ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build()); + EXPECT_EQ(metadata->current_schema_id.value(), 2); // Should be schema2 +} + +TEST(TableMetadataBuilderTest, AddSchemaAndSetCurrentCombined) { + auto base = CreateBaseMetadata(); + auto builder = TableMetadataBuilder::BuildFrom(base.get()); + + // Add a schema and immediately set it as current + auto field1 = 
SchemaField::MakeRequired(4, "new_field", int64());
+  auto new_schema = std::make_shared<Schema>(std::vector<SchemaField>{field1}, 1);
+
+  builder->AddSchema(new_schema);
+  builder->SetCurrentSchema(-1);  // Set to last added
+
+  ICEBERG_UNWRAP_OR_FAIL(auto metadata, builder->Build());
+  ASSERT_EQ(metadata->schemas.size(), 2);
+  EXPECT_EQ(metadata->current_schema_id.value(), 1);
+  EXPECT_EQ(metadata->schemas[1]->schema_id().value(), 1);
+  EXPECT_EQ(metadata->last_column_id, 4);
+}
+
 TEST(TableMetadataBuilderTest, SetCurrentSchemaInvalid) {
   auto base = CreateBaseMetadata();

diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc
index 4c39d01e0..560cc3921 100644
--- a/src/iceberg/transform.cc
+++ b/src/iceberg/transform.cc
@@ -388,6 +388,8 @@ std::string Transform::ToString() const {
   std::unreachable();
 }

+std::string Transform::DedupName() const { return ToString(); }
+
 Result<std::string> Transform::GeneratePartitionName(std::string_view source_name) const {
   switch (transform_type_) {
     case TransformType::kIdentity:
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
index 47cf3ee91..1044e264f 100644
--- a/src/iceberg/transform.h
+++ b/src/iceberg/transform.h
@@ -197,6 +197,10 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
   /// \brief Returns a string representation of this transform (e.g., "bucket[16]").
   std::string ToString() const override;

+  /// \brief Return the unique transform name to check if similar transforms for the same
+  /// source field are added multiple times in partition spec builder.
+  std::string DedupName() const;
+
   /// \brief Generates a partition name for the transform.
   /// \param source_name The name of the source column.
   /// \return A string representation of the partition name.

From 7a34102aec65a6aa67c53a39eb5d3645b0c5ec96 Mon Sep 17 00:00:00 2001
From: Guotao Yu
Date: Fri, 26 Dec 2025 23:47:05 +0800
Subject: [PATCH 2/3] fix build failure

---
 src/iceberg/partition_spec.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc
index 97a318faa..00ab5b679 100644
--- a/src/iceberg/partition_spec.cc
+++ b/src/iceberg/partition_spec.cc
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <map>
 #include 
 #include 
 #include 
@@ -265,19 +266,21 @@ bool PartitionSpec::HasSequentialFieldIds(const PartitionSpec& spec) {

 Status PartitionSpec::ValidateRedundantPartitions(const PartitionSpec& spec) {
   // Use a map to track deduplication keys (source_id + transform dedup name)
-  std::unordered_map<std::pair<int32_t, std::string>, const PartitionField*> dedup_fields;
+  std::map<std::pair<int32_t, std::string>, const PartitionField*> dedup_fields;

   for (const auto& field : spec.fields()) {
-    // Create dedup key: (source_id, transform_dedup_name)
+    // The dedup name is provided by the transform's DedupName() method
+    // which typically returns the transform's string representation
     auto dedup_key = std::make_pair(field.source_id(), field.transform()->DedupName());

     // Check if this dedup key already exists
+    // If it does, we have found a redundant partition field
     auto existing_field_iter = dedup_fields.find(dedup_key);
     ICEBERG_PRECHECK(existing_field_iter == dedup_fields.end(),
                      "Cannot add redundant partition: {} conflicts with {}",
                      field.ToString(), existing_field_iter->second->ToString());

-    // Add this field to the dedup map
+    // Add this field to the dedup map for future conflict detection
     dedup_fields[dedup_key] = &field;
   }


From 14a04bad0389a067b34abc235b4df7c457a1764c Mon Sep 17 00:00:00 2001
From: Guotao Yu
Date: Sat, 27 Dec 2025 12:39:44 +0800
Subject: [PATCH 3/3] fix test failure
---
 src/iceberg/partition_spec.cc                 | 21 +++++++--------
 .../test/update_partition_spec_test.cc        | 26 ++++++++++---------
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc
index 00ab5b679..f533ef88d 100644
--- a/src/iceberg/partition_spec.cc
+++ b/src/iceberg/partition_spec.cc
@@ -187,11 +187,10 @@ Status PartitionSpec::ValidatePartitionName(const Schema& schema,
   std::unordered_set<std::string> partition_names;
   for (const auto& partition_field : spec.fields()) {
     auto name = std::string(partition_field.name());
-    ICEBERG_PRECHECK(!name.empty(), "Cannot use empty partition name: {}", name);
+    ICEBERG_CHECK(!name.empty(), "Cannot use empty partition name: {}", name);

-    if (partition_names.contains(name)) {
-      return InvalidArgument("Cannot use partition name more than once: {}", name);
-    }
+    ICEBERG_PRECHECK(!partition_names.contains(name),
+                     "Cannot use partition name more than once: {}", name);
     partition_names.insert(name);

     ICEBERG_ASSIGN_OR_RAISE(auto schema_field, schema.FindFieldByName(name));
@@ -202,17 +201,15 @@
       // field name as long as they are sourced from the same schema field
       if (schema_field.has_value() &&
           schema_field.value().get().field_id() != partition_field.source_id()) {
-        return InvalidArgument(
+        return ValidationFailed(
            "Cannot create identity partition sourced from different field in schema: {}",
            name);
       }
     } else {
       // for all other transforms we don't allow conflicts between partition name and
       // schema field name
-      if (schema_field.has_value()) {
-        return InvalidArgument(
-            "Cannot create partition from name that exists in schema: {}", name);
-      }
+      ICEBERG_CHECK(!schema_field.has_value(),
+                    "Cannot create partition from name that exists in schema: {}", name);
     }
   }

@@ -276,9 +273,9 @@ Status PartitionSpec::ValidateRedundantPartitions(const PartitionSpec& spec) {
     // Check if this dedup key already exists
     // If it does, we have found a redundant partition field
     auto existing_field_iter = dedup_fields.find(dedup_key);
-    ICEBERG_PRECHECK(existing_field_iter == dedup_fields.end(),
-                     "Cannot add redundant partition: {} conflicts with {}",
-                     field.ToString(), existing_field_iter->second->ToString());
+    ICEBERG_CHECK(existing_field_iter == dedup_fields.end(),
+                  "Cannot add redundant partition: {} conflicts with {}",
+                  field.ToString(), existing_field_iter->second->ToString());

     // Add this field to the dedup map for future conflict detection
     dedup_fields[dedup_key] = &field;
diff --git a/src/iceberg/test/update_partition_spec_test.cc b/src/iceberg/test/update_partition_spec_test.cc
index 31dbbae64..4da8f448d 100644
--- a/src/iceberg/test/update_partition_spec_test.cc
+++ b/src/iceberg/test/update_partition_spec_test.cc
@@ -729,39 +729,41 @@ TEST_P(UpdatePartitionSpecTest, TestRemoveAndAddMultiTimes) {
   update3->AddField(Expressions::Day("ts"), "ts_date");
   auto add_second_time_spec = ApplyUpdateAndGetSpec(update3);

-  // Remove second time
-  auto table3 = CreateTableWithSpec(add_second_time_spec, "test_table3");
-  auto update4 = CreateUpdateFromTable(table3);
-  update4->RemoveField(Expressions::Day("ts"));
-  auto remove_second_time_spec = ApplyUpdateAndGetSpec(update4);
-
   // Add third time with month
-  auto table4 = CreateTableWithSpec(remove_second_time_spec, "test_table4");
+  auto table4 = CreateTableWithSpec(add_second_time_spec, "test_table4");
   auto update5 = CreateUpdateFromTable(table4);
-  update5->AddField(Expressions::Month("ts"));
+  update5->AddField(Expressions::Month("ts"), "ts_month");
   auto add_third_time_spec = ApplyUpdateAndGetSpec(update5);

   // Rename ts_month to ts_date
   auto table5 = CreateTableWithSpec(add_third_time_spec, "test_table5");
   auto update6 = CreateUpdateFromTable(table5);
-  update6->RenameField("ts_month", "ts_date");
+  update6->RenameField("ts_month", "__ts_date");
   auto updated_spec = ApplyUpdateAndGetSpec(update6);

   if (format_version_ == 1) {
+    // Remove second time
+    auto table3 = CreateTableWithSpec(add_second_time_spec, "test_table3");
+    auto update4 = CreateUpdateFromTable(table3);
+    update4->RemoveField(Expressions::Day("ts"));
+    ExpectError(update4, ErrorKind::kValidationFailed,
+                "Cannot add redundant partition: ts_date");
+
     ASSERT_EQ(updated_spec->fields().size(), 3);
     // In V1, we expect void transforms for deleted fields
     EXPECT_TRUE(updated_spec->fields()[0].name().find("ts_date") == 0);
     EXPECT_TRUE(updated_spec->fields()[1].name().find("ts_date") == 0);
-    EXPECT_EQ(updated_spec->fields()[2].name(), "ts_date");
+    EXPECT_EQ(updated_spec->fields()[2].name(), "__ts_date");
     EXPECT_EQ(*updated_spec->fields()[0].transform(), *Transform::Void());
-    EXPECT_EQ(*updated_spec->fields()[1].transform(), *Transform::Void());
+    EXPECT_EQ(*updated_spec->fields()[1].transform(), *Transform::Day());
     EXPECT_EQ(*updated_spec->fields()[2].transform(), *Transform::Month());
   } else {
     ICEBERG_UNWRAP_OR_FAIL(
         auto expected_spec,
         PartitionSpec::Make(PartitionSpec::kInitialSpecId,
                             std::vector<PartitionField>{
-                                PartitionField(2, 1000, "ts_date", Transform::Month())},
+                                PartitionField(2, 1000, "ts_date", Transform::Day()),
+                                PartitionField(2, 1001, "__ts_date", Transform::Month())},
                             1000));
     auto expected = std::shared_ptr<PartitionSpec>(expected_spec.release());
     AssertPartitionSpecEquals(*expected, *updated_spec);
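
Not part of the patch series above: a minimal sketch of how the new redundancy check surfaces to callers, assuming the PartitionSpec::Make overload, PartitionField constructor, and Transform/SchemaField factories exercised in partition_spec_test.cc. The demo function name and the exact include set are illustrative assumptions; additional headers (e.g., for the timestamp() type factory) may be needed.

    // Hypothetical demo, not part of the patches. Uses only calls that appear
    // in the tests above; include paths follow the file paths in the diffs.
    #include "iceberg/partition_spec.h"
    #include "iceberg/schema.h"
    #include "iceberg/transform.h"

    namespace iceberg {

    void RedundantPartitionDemo() {
      auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp());
      Schema schema({ts_field}, Schema::kInitialSchemaId);

      // Two Day() transforms on source id 1 share the dedup key
      // (source_id, DedupName()), so Make() now reports
      // "Cannot add redundant partition: ... conflicts with ...".
      PartitionField day_a(1, 1000, "ts_day_a", Transform::Day());
      PartitionField day_b(1, 1001, "ts_day_b", Transform::Day());
      [[maybe_unused]] auto redundant =
          PartitionSpec::Make(schema, 1, {day_a, day_b}, false);

      // Transforms with different dedup names (Day vs Month) on the same
      // source column are still accepted.
      PartitionField month(1, 1002, "ts_month", Transform::Month());
      [[maybe_unused]] auto ok =
          PartitionSpec::Make(schema, 1, {day_a, month}, false);
    }

    }  // namespace iceberg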