diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java b/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java index 325fc8a53..5a6b6a9a4 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/BaseFileUpdatesExtractor.java @@ -90,8 +90,9 @@ ReplaceMetadata extractSnapshotChanges( // can be dropped Set partitionPathsToDrop = new HashSet<>( - FSUtils.getAllPartitionPaths( - engineContext, metadataConfig, metaClient.getBasePathV2().toString())); + HudiPathUtils.filterMetadataPaths( + FSUtils.getAllPartitionPaths( + engineContext, metadataConfig, metaClient.getBasePathV2().toString()))); ReplaceMetadata replaceMetadata = partitionedDataFiles.stream() .map( diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java index 5e17b389f..f164d9217 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiDataFileExtractor.java @@ -115,7 +115,8 @@ public List getFilesCurrentState(InternalTable table) { tableMetadata != null ? tableMetadata.getAllPartitionPaths() : FSUtils.getAllPartitionPaths(engineContext, metadataConfig, basePath.toString()); - return getInternalDataFilesForPartitions(allPartitionPaths, table); + return getInternalDataFilesForPartitions( + HudiPathUtils.filterMetadataPaths(allPartitionPaths), table); } catch (IOException ex) { throw new ReadException( "Unable to read partitions for table " + metaClient.getTableConfig().getTableName(), ex); diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java index 545f32150..62db51832 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiPathUtils.java @@ -18,9 +18,19 @@ package org.apache.xtable.hudi; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + import org.apache.hadoop.fs.Path; public class HudiPathUtils { + + private static final Set METADATA_DIR_NAMES = + new HashSet<>(Arrays.asList(".hoodie", "_delta_log")); + public static String getPartitionPath(Path tableBasePath, Path filePath) { String fileName = filePath.getName(); String pathStr = filePath.toUri().getPath(); @@ -28,4 +38,18 @@ public static String getPartitionPath(Path tableBasePath, Path filePath) { int endIndex = pathStr.length() - fileName.length() - 1; return endIndex <= startIndex ? "" : pathStr.substring(startIndex, endIndex); } + + /** Filters out known metadata directory paths like _delta_log and .hoodie. */ + public static List filterMetadataPaths(List partitionPaths) { + return partitionPaths.stream() + .filter( + p -> { + if (p.isEmpty()) { + return true; + } + String name = p.substring(p.lastIndexOf('/') + 1); + return !METADATA_DIR_NAMES.contains(name); + }) + .collect(Collectors.toList()); + } } diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java b/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java index 792c70635..8c5003a2c 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/catalog/HudiCatalogPartitionSyncTool.java @@ -49,6 +49,7 @@ import org.apache.xtable.catalog.CatalogPartitionSyncOperations; import org.apache.xtable.catalog.CatalogPartitionSyncTool; import org.apache.xtable.exception.CatalogSyncException; +import org.apache.xtable.hudi.HudiPathUtils; import org.apache.xtable.hudi.HudiTableManager; import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.catalog.CatalogTableIdentifier; @@ -213,7 +214,8 @@ private void updateLastCommitTimeSynced( public List getAllPartitionPathsOnStorage(String basePath) { HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(configuration); // ToDo - if we need to config to validate assumeDatePartitioning - return FSUtils.getAllPartitionPaths(engineContext, basePath, true, false); + return HudiPathUtils.filterMetadataPaths( + FSUtils.getAllPartitionPaths(engineContext, basePath, true, false)); } public List getWrittenPartitionsSince( diff --git a/xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java b/xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java new file mode 100644 index 000000000..376a8dccd --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/hudi/TestHudiPathUtils.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.hudi; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.Test; + +public class TestHudiPathUtils { + + @Test + void filterMetadataPaths_removesKnownMetadataDirs() { + List input = Arrays.asList("region=US", "_delta_log", ".hoodie", "year=2024"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList("region=US", "year=2024"), result); + } + + @Test + void filterMetadataPaths_nestedMetadataDirAsLastSegment() { + List input = Arrays.asList("year=2024/_delta_log", "region=US/.hoodie"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Collections.emptyList(), result); + } + + @Test + void filterMetadataPaths_keepsEmptyString() { + List input = Arrays.asList("", "region=US"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList("", "region=US"), result); + } + + @Test + void filterMetadataPaths_keepsPartitionStartingWithUnderscore() { + List input = Arrays.asList("_status=active", "_year=2024", "region=US"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList("_status=active", "_year=2024", "region=US"), result); + } + + @Test + void filterMetadataPaths_keepsPartitionStartingWithDot() { + List input = Arrays.asList(".version=1", "region=US"); + List result = HudiPathUtils.filterMetadataPaths(input); + assertEquals(Arrays.asList(".version=1", "region=US"), result); + } +} \ No newline at end of file diff --git a/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java b/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java index 0c33013a5..aae876535 100644 --- a/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java +++ b/xtable-core/src/test/java/org/apache/xtable/hudi/catalog/TestHudiCatalogPartitionSyncTool.java @@ -120,6 +120,21 @@ private void setupCommonMocks() { mockHudiCatalogPartitionSyncTool = createMockHudiPartitionSyncTool(); } + @Test + void testGetAllPartitionPathsOnStorageFiltersMetadataDirs() { + setupCommonMocks(); + try (MockedStatic mockFSUtils = mockStatic(FSUtils.class)) { + mockFSUtils + .when(() -> FSUtils.getAllPartitionPaths(any(), eq(TEST_BASE_PATH), eq(true), eq(false))) + .thenReturn(Arrays.asList("key1", "_delta_log", ".hoodie", "key2")); + + List result = + mockHudiCatalogPartitionSyncTool.getAllPartitionPathsOnStorage(TEST_BASE_PATH); + + assertEquals(Arrays.asList("key1", "key2"), result); + } + } + @SneakyThrows @Test void testSyncAllPartitions() {