RESTCatalogOptions.java
@@ -103,4 +103,27 @@ public class RESTCatalogOptions {
.stringType()
.noDefaultValue()
.withDescription("REST Catalog DLF OSS endpoint.");

public static final ConfigOption<Boolean> IO_CACHE_ENABLED =
ConfigOptions.key("io-cache.enabled")
.booleanType()
.defaultValue(false)
.withDescription(
"Enable caching when accessing files through FileIO (currently only JindoFileIO supports caching).");
public static final ConfigOption<String> IO_CACHE_WHITELIST_PATH =
ConfigOptions.key("io-cache.whitelist-path")
.stringType()
.defaultValue("bucket-,manifest")
.withDescription(
"Cache is applied only to paths that contain one of the specified comma-separated patterns; * means all paths.");
public static final ConfigOption<String> IO_CACHE_POLICY =
ConfigOptions.key("io-cache.policy")
.stringType()
.noDefaultValue()
.withDescription(
"The table-level cache policy provided by the REST server, a comma-separated combination of: meta, read, write. "
+ "`meta`: meta cache is enabled when visiting files; "
+ "`read`: cache is enabled when reading files; "
+ "`write`: data is also cached when writing files; "
+ "`none`: cache is entirely disabled.");
}
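
(Aside: a minimal usage sketch of the new options. The option keys and the default whitelist value come from this change; Options.fromMap and CatalogContext.create are assumed Paimon helpers, and the policy value is illustrative since the REST server normally supplies it.)

    import org.apache.paimon.catalog.CatalogContext;
    import org.apache.paimon.options.Options;

    import java.util.HashMap;
    import java.util.Map;

    public class IoCacheOptionsSketch {
        public static void main(String[] args) {
            Map<String, String> conf = new HashMap<>();
            conf.put("io-cache.enabled", "true");                    // turn FileIO caching on
            conf.put("io-cache.policy", "meta,read");                // cache metadata and reads, not writes
            conf.put("io-cache.whitelist-path", "bucket-,manifest"); // the default whitelist
            // A FileIO created from this context would see these flags in configure().
            CatalogContext context = CatalogContext.create(Options.fromMap(conf));
        }
    }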
35 changes: 35 additions & 0 deletions paimon-filesystems/paimon-jindo/pom.xml
@@ -88,5 +88,40 @@
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.2</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.3</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
HadoopCompliantFileIO.java
@@ -18,6 +18,7 @@

package org.apache.paimon.jindo;

import org.apache.paimon.catalog.CatalogContext;
import org.apache.paimon.fs.FileIO;
import org.apache.paimon.fs.FileStatus;
import org.apache.paimon.fs.Path;
@@ -27,31 +28,107 @@
import org.apache.paimon.fs.VectoredReadable;
import org.apache.paimon.utils.Pair;

import org.apache.paimon.shade.guava30.com.google.common.collect.Lists;

import com.aliyun.jindodata.common.JindoHadoopSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import static org.apache.paimon.rest.RESTCatalogOptions.IO_CACHE_ENABLED;
import static org.apache.paimon.rest.RESTCatalogOptions.IO_CACHE_POLICY;
import static org.apache.paimon.rest.RESTCatalogOptions.IO_CACHE_WHITELIST_PATH;

/**
* Hadoop {@link FileIO}.
*
* <p>Important: this class is copied from HadoopFileIO to avoid class loader conflicts.
*/
public abstract class HadoopCompliantFileIO implements FileIO {
private static final Logger LOG = LoggerFactory.getLogger(HadoopCompliantFileIO.class);

private static final long serialVersionUID = 1L;

// Detailed cache strategies are retrieved from the REST server.
private static final String META_CACHE_ENABLED_TAG = "meta";
private static final String READ_CACHE_ENABLED_TAG = "read";
private static final String WRITE_CACHE_ENABLED_TAG = "write";
private static final String DISABLE_CACHE_TAG = "none";

protected boolean metaCacheEnabled = false;
protected boolean readCacheEnabled = false;
protected boolean writeCacheEnabled = false;

protected transient volatile Map<String, Pair<JindoHadoopSystem, String>> fsMap;
protected transient volatile Map<String, Pair<JindoHadoopSystem, String>> jindoCacheFsMap;

// Only enable cache for paths that match the whitelist patterns (typically paths generated with a UUID)
private List<String> cacheWhitelistPaths = new ArrayList<>();

boolean shouldCache(Path path) {
if (cacheWhitelistPaths.isEmpty()) {
return true;
}
String pathStr = path.toUri().getPath();
for (String pattern : cacheWhitelistPaths) {
if (pathStr.contains(pattern)) {
return true;
}
}
return false;
}
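
(Aside: how the whitelist check above behaves with the default patterns; the paths are made-up examples.)

    // With cacheWhitelistPaths = ["bucket-", "manifest"]:
    // /warehouse/db.db/tbl/bucket-0/data-<uuid>.orc -> cached ("bucket-" matches)
    // /warehouse/db.db/tbl/manifest/manifest-<uuid> -> cached ("manifest" matches)
    // /warehouse/db.db/tbl/snapshot/snapshot-12     -> not cached (no pattern matches)
    // With io-cache.whitelist-path = "*", the list stays empty and shouldCache returns true for every path.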

@Override
public void configure(CatalogContext context) {
// Process file io cache configuration
if (!context.options().get(IO_CACHE_ENABLED)
|| context.options().get(IO_CACHE_POLICY) == null
|| context.options().get(IO_CACHE_POLICY).contains(DISABLE_CACHE_TAG)) {
LOG.debug(
"Cache is disabled with io-cache.enabled={}, io-cache.policy={}",
context.options().get(IO_CACHE_ENABLED),
context.options().get(IO_CACHE_POLICY));
return;
}
// Enable file io cache
if (context.options().get("fs.jindocache.namespace.rpc.address") == null) {
LOG.info(
"FileIO cache is enabled but JindoCache RPC address is not set, fallback to no-cache");
} else {
metaCacheEnabled =
context.options().get(IO_CACHE_POLICY).contains(META_CACHE_ENABLED_TAG);
readCacheEnabled =
context.options().get(IO_CACHE_POLICY).contains(READ_CACHE_ENABLED_TAG);
writeCacheEnabled =
context.options().get(IO_CACHE_POLICY).contains(WRITE_CACHE_ENABLED_TAG);
String whitelist = context.options().get(IO_CACHE_WHITELIST_PATH);
if (!whitelist.equals("*")) {
cacheWhitelistPaths = Lists.newArrayList(whitelist.split(","));
}
LOG.info(
"Cache enabled with cache policy: meta cache enabled {}, read cache enabled {}, write cache enabled {}, whitelist path: {}",
metaCacheEnabled,
readCacheEnabled,
writeCacheEnabled,
whitelist);
}
}
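
(Aside: the flag combinations configure() produces, derived from the tags above; the option values are illustrative.)

    // io-cache.policy = "meta,read" => metaCacheEnabled = true, readCacheEnabled = true, writeCacheEnabled = false
    // io-cache.policy = "write"     => only writeCacheEnabled = true
    // io-cache.policy = "none"      => early return: all flags stay false
    // io-cache.policy unset         => early return as well; caching stays disabled
    // (in every case the flags only take effect when io-cache.enabled=true and the JindoCache RPC address is set)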

@Override
public SeekableInputStream newInputStream(Path path) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
Pair<JindoHadoopSystem, String> pair = getFileSystemPair(hadoopPath);
boolean shouldCache = readCacheEnabled && shouldCache(path);
LOG.debug("InputStream should cache {} for path {}", shouldCache, path);
Pair<JindoHadoopSystem, String> pair = getFileSystemPair(hadoopPath, shouldCache);
JindoHadoopSystem fs = pair.getKey();
String sysType = pair.getValue();
FSDataInputStream fsInput = fs.open(hadoopPath);
@@ -63,22 +140,27 @@ public SeekableInputStream newInputStream(Path path) throws IOException {
@Override
public PositionOutputStream newOutputStream(Path path, boolean overwrite) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
boolean shouldCache = writeCacheEnabled && shouldCache(path);
LOG.debug("OutputStream should cache {} for path {}", shouldCache, path);
return new HadoopPositionOutputStream(
getFileSystem(hadoopPath).create(hadoopPath, overwrite));
getFileSystem(hadoopPath, shouldCache).create(hadoopPath, overwrite));
}

@Override
public FileStatus getFileStatus(Path path) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
return new HadoopFileStatus(getFileSystem(hadoopPath).getFileStatus(hadoopPath));
boolean shouldCache = metaCacheEnabled && shouldCache(path);
LOG.debug("GetFileStatus should cache {} for path {}", shouldCache, path);
return new HadoopFileStatus(
getFileSystem(hadoopPath, shouldCache).getFileStatus(hadoopPath));
}

@Override
public FileStatus[] listStatus(Path path) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
FileStatus[] statuses = new FileStatus[0];
org.apache.hadoop.fs.FileStatus[] hadoopStatuses =
getFileSystem(hadoopPath).listStatus(hadoopPath);
getFileSystem(hadoopPath, false).listStatus(hadoopPath);
if (hadoopStatuses != null) {
statuses = new FileStatus[hadoopStatuses.length];
for (int i = 0; i < hadoopStatuses.length; i++) {
@@ -93,7 +175,7 @@ public RemoteIterator<FileStatus> listFilesIterative(Path path, boolean recursiv
throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
org.apache.hadoop.fs.RemoteIterator<org.apache.hadoop.fs.LocatedFileStatus> hadoopIter =
getFileSystem(hadoopPath).listFiles(hadoopPath, recursive);
getFileSystem(hadoopPath, false).listFiles(hadoopPath, recursive);
return new RemoteIterator<FileStatus>() {
@Override
public boolean hasNext() throws IOException {
@@ -111,26 +193,28 @@ public FileStatus next() throws IOException {
@Override
public boolean exists(Path path) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
return getFileSystem(hadoopPath).exists(hadoopPath);
boolean shouldCache = metaCacheEnabled && shouldCache(path);
LOG.debug("Exists should cache {} for path {}", shouldCache, path);
return getFileSystem(hadoopPath, shouldCache).exists(hadoopPath);
}

@Override
public boolean delete(Path path, boolean recursive) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
return getFileSystem(hadoopPath).delete(hadoopPath, recursive);
return getFileSystem(hadoopPath, false).delete(hadoopPath, recursive);
}

@Override
public boolean mkdirs(Path path) throws IOException {
org.apache.hadoop.fs.Path hadoopPath = path(path);
return getFileSystem(hadoopPath).mkdirs(hadoopPath);
return getFileSystem(hadoopPath, false).mkdirs(hadoopPath);
}

@Override
public boolean rename(Path src, Path dst) throws IOException {
org.apache.hadoop.fs.Path hadoopSrc = path(src);
org.apache.hadoop.fs.Path hadoopDst = path(dst);
return getFileSystem(hadoopSrc).rename(hadoopSrc, hadoopDst);
return getFileSystem(hadoopSrc, false).rename(hadoopSrc, hadoopDst);
}

protected org.apache.hadoop.fs.Path path(Path path) {
@@ -141,22 +225,34 @@ protected org.apache.hadoop.fs.Path path(Path path) {
return new org.apache.hadoop.fs.Path(path.toUri());
}

protected JindoHadoopSystem getFileSystem(org.apache.hadoop.fs.Path path) throws IOException {
return getFileSystemPair(path).getKey();
protected JindoHadoopSystem getFileSystem(org.apache.hadoop.fs.Path path, boolean enableCache)
throws IOException {
return getFileSystemPair(path, enableCache).getKey();
}

protected Pair<JindoHadoopSystem, String> getFileSystemPair(org.apache.hadoop.fs.Path path)
throws IOException {
if (fsMap == null) {
synchronized (this) {
if (fsMap == null) {
fsMap = new ConcurrentHashMap<>();
protected Pair<JindoHadoopSystem, String> getFileSystemPair(
org.apache.hadoop.fs.Path path, boolean enableCache) throws IOException {
Map<String, Pair<JindoHadoopSystem, String>> map;
if (enableCache) {
if (jindoCacheFsMap == null) {
synchronized (this) {
if (jindoCacheFsMap == null) {
jindoCacheFsMap = new ConcurrentHashMap<>();
}
}
}
map = jindoCacheFsMap;
} else {
if (fsMap == null) {
synchronized (this) {
if (fsMap == null) {
fsMap = new ConcurrentHashMap<>();
}
}
}
map = fsMap;
}

Map<String, Pair<JindoHadoopSystem, String>> map = fsMap;

String authority = path.toUri().getAuthority();
if (authority == null) {
authority = "DEFAULT";
@@ -166,7 +262,7 @@ protected Pair<JindoHadoopSystem, String> getFileSystemPair(org.apache.hadoop.fs
authority,
k -> {
try {
return createFileSystem(path);
return createFileSystem(path, enableCache);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
@@ -177,7 +273,7 @@ protected Pair<JindoHadoopSystem, String> getFileSystemPair(org.apache.hadoop.fs
}
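
(Aside: the per-map lazy initialization above is standard double-checked locking; a self-contained sketch of the pattern, with illustrative names.)

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    class LazyMapHolder {
        // transient + volatile: each deserialized instance rebuilds the map on first use,
        // and the volatile read makes the fast path safe without a lock.
        private transient volatile Map<String, Object> lazyMap;

        Map<String, Object> lazyMap() {
            if (lazyMap == null) {            // fast path: no lock once initialized
                synchronized (this) {
                    if (lazyMap == null) {    // re-check under the lock
                        lazyMap = new ConcurrentHashMap<>();
                    }
                }
            }
            return lazyMap;
        }
    }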

protected abstract Pair<JindoHadoopSystem, String> createFileSystem(
org.apache.hadoop.fs.Path path) throws IOException;
org.apache.hadoop.fs.Path path, boolean enableCache) throws IOException;

private static class HadoopSeekableInputStream extends SeekableInputStream {
