diff --git a/Makefile b/Makefile index 1decc8b01b..cbd6007a22 100644 --- a/Makefile +++ b/Makefile @@ -61,8 +61,8 @@ quoted_perl_command = $(subst ','\'',$(perl_command)) # `make install-shared`, `make static_lib`, `make install-static` or # `make install` -# Set the default DEBUG_LEVEL to 1 -DEBUG_LEVEL?=1 +# Set the default DEBUG_LEVEL to 2 +DEBUG_LEVEL?=2 # OBJ_DIR is where the object files reside. Default to the current directory OBJ_DIR?=. @@ -120,7 +120,7 @@ endif ifeq (${DISABLE_JEMALLOC},1) ifeq (${ROCKSDB_DISABLE_JEMALLOC},) export_ROCKSDB_DISABLE_JEMALLOC := export ROCKSDB_DISABLE_JEMALLOC=1; - export ROCKSDB_DISABLE_JEMALLOC = 1 + export ROCKSDB_DISABLE_JEMALLOC = 1 endif endif @@ -244,9 +244,15 @@ endif # interfaces/internal abstractions, like in the iterator hierarchy. It works # better when combined with profile-guided optimizations (not currently # supported natively in Makefile). +OPTION_jemalloc := jemalloc-$(if $(filter 1,${DISABLE_JEMALLOC}),0,1) +OPTION_dyna_tls := dyna_tls-$(if $(filter 1,${TOPLING_USE_DYNAMIC_TLS}),1,0) +OPTION_lto := lto-0 ifeq ($(USE_LTO), 1) - CXXFLAGS += -flto - LDFLAGS += -flto=auto -fuse-linker-plugin + ifeq (${DEBUG_LEVEL},0) + CXXFLAGS += -flto + LDFLAGS += -flto=auto -fuse-linker-plugin + OPTION_lto := lto-$(if $(filter 1,${USE_LTO}),1,0) + endif endif # `COERCE_CONTEXT_SWITCH=1` will inject spurious wakeup and @@ -389,7 +395,7 @@ endif TOPLING_LIB_OBJECTS = $(addprefix ${TOPLING_CORE_DIR}/, ${TOPLING_LIB_OBJ_LIST_VAR}) LDFLAGS += ${TOPLING_CORE_LD_LIBS_EXTRA} -ifneq ($(filter auto_all_tests check check_0 watch-log gen_parallel_tests %_test %_test2 jtest, $(MAKECMDGOALS)),) +ifneq ($(filter auto_all_tests check check_0 watch-log gen_parallel_tests %_test %_test2 jtest %_test.o %_test2.o, $(MAKECMDGOALS)),) MAKE_UNIT_TEST ?= 1 endif ifeq (${MAKE_UNIT_TEST},1) @@ -406,6 +412,15 @@ endif ORIG_OBJ_DIR := ${OBJ_DIR} OBJ_DIR := ${BUILD_PREFIX}${OBJ_DIR}/v${ROCKSDB_FULL_VERSION} +# COMPILER is in ignored +TRIAL_urldir := 
toplingdb/gpl-trial/${OPTION_lto}-${OPTION_jemalloc}-${OPTION_dyna_tls}/${UNAME_MachineSystem}-bmi2-${WITH_BMI2}/${BUILD_TYPE_SIG} +ifeq (${PLATFORM},OS_LINUX) +LINUX_NAME := $(shell source /etc/os-release; echo $$ID) +ifeq (${LINUX_NAME},centos) +TRIAL_urldir := ${TRIAL_urldir}/$(shell source /etc/os-release; echo $$ID$$VERSION_ID) +endif +endif + # 1. we define ROCKSDB_DISABLE_ZSTD=1 on build_detect_platform. # 2. zstd lib is included in libterark-zbs # 3. we alway use ZSTD @@ -432,33 +447,24 @@ ifndef WITH_TOPLING_ROCKS cd topling-rocks; \ git submodule update --init --recursive \ ) - endif - ifeq (,$(wildcard sideplugin/topling-rocks)) - WITH_TOPLING_ROCKS := 0 else - WITH_TOPLING_ROCKS := 1 + ifeq (,$(wildcard sideplugin/topling-rocks/src/table/top_patent_algo.cc)) + dummy := $(shell rm -rf sideplugin/topling-rocks) + endif endif + # default 1 + WITH_TOPLING_ROCKS := 1 endif ifeq (${WITH_TOPLING_ROCKS},1) -ifeq (,$(wildcard sideplugin/topling-rocks)) - # topling specific: just for people who has permission to topling-rocks - dummy := $(shell set -e -x; \ - cd sideplugin; \ - git clone ${GIT_TOPLING_ROCKS}; \ - cd topling-rocks; \ - git submodule update --init --recursive \ - ) -else +ifneq (,$(wildcard sideplugin/topling-rocks)) ifneq (${UPDATE_REPO},0) ifeq (${MAKE_RESTARTS},) dummy := $(shell set -ex; cd sideplugin/topling-rocks && git pull) endif endif endif -ifeq (,$(wildcard sideplugin/topling-rocks/src/table/top_zip_table_builder.cc)) - $(error WITH_TOPLING_ROCKS=1 but repo sideplugin/topling-rocks is broken) -endif +CXXFLAGS += -DHAS_TOPLING_ROCKS endif ifeq (,$(wildcard sideplugin/cspp-memtable)) @@ -611,13 +617,14 @@ endif # WITH_TOPLING_DCOMPACT ifeq (${WITH_TOPLING_ROCKS},1) ifneq (,$(wildcard sideplugin/topling-rocks)) CXXFLAGS += -Isideplugin/topling-rocks/src - CXXFLAGS += -DHAS_TOPLING_ROCKS TOPLING_ROCKS_GIT_VER_SRC = ${BUILD_ROOT}/git-version-topling_rocks.cc - EXTRA_LIB_SOURCES += \ - $(wildcard sideplugin/topling-rocks/src/table/*.cc) \ - 
sideplugin/topling-rocks/${TOPLING_ROCKS_GIT_VER_SRC} -else - $(warning NotFound sideplugin/topling-rocks, this is ok, only ToplingZipTable is disabled) + ifeq (,${TOPLING_ZIP_TABLE_TRIAL_DAYS}) + EXTRA_LIB_SOURCES += $(wildcard sideplugin/topling-rocks/src/table/*.cc) + EXTRA_LIB_SOURCES += sideplugin/topling-rocks/${TOPLING_ROCKS_GIT_VER_SRC} + else + # no TOPLING_ROCKS_GIT_VER_SRC + EXTRA_LIB_SOURCES += sideplugin/topling-zip_table_reader/top_zip_table_builder.cc + endif endif endif @@ -785,7 +792,7 @@ endif ifeq ($(LIB_MODE),shared) # So that binaries are executable from build location, in addition to install location -EXEC_LDFLAGS += -Wl,-rpath -Wl,'$$ORIGIN' +EXEC_LDFLAGS += -Wl,-rpath -Wl,'$$ORIGIN:$$ORIGIN/../lib' endif ifeq ($(PLATFORM), OS_MACOSX) @@ -1033,6 +1040,9 @@ endif # topling specific WARNING_FLAGS WARNING_FLAGS := -Wall -Wno-shadow ifeq "$(shell a=${COMPILER};echo $${a:0:5})" "clang" + CXXFLAGS := $(patsubst -flto, -flto=thin, ${CXXFLAGS}) + LLD_LTO_FLAGS := -fuse-ld=lld -flto=thin -Wl,--thinlto-jobs=all + LDFLAGS := $(patsubst -flto=auto, ${LLD_LTO_FLAGS}, ${LDFLAGS}) LDFLAGS += -latomic #$(error LDFLAGS = ${LDFLAGS}) WARNING_FLAGS += -Wno-deprecated-builtins @@ -1100,6 +1110,12 @@ ifneq ($(PPC_LIBC_IS_GNU),0) LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES)) endif +ifeq (${WITH_TOPLING_ROCKS},1) + ifeq (,$(wildcard sideplugin/topling-rocks)) + LIB_OBJECTS += $(OBJ_DIR)/sideplugin/topling-zip_table_reader/top_zip_table_builder.o + endif +endif + GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = $(OBJ_DIR)/test_util/testutil.o TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST) @@ -2592,39 +2608,44 @@ install-headers: gen-pc install -d $(INSTALL_LIBDIR) install -d $(INSTALL_LIBDIR)/pkgconfig for header_dir in `$(FIND) "include/rocksdb" -type d`; do \ - install -d $(DESTDIR)/$(PREFIX)/$$header_dir; \ + install -d $(DESTDIR)$(PREFIX)/$$header_dir; \ done for header in `$(FIND) "include/rocksdb" 
-type f -name *.h`; do \ - install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/$$header; \ + install -C -m 644 $$header $(DESTDIR)$(PREFIX)/$$header; \ done for header in $(ROCKSDB_PLUGIN_HEADERS); do \ - install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \ - install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \ + install -d $(DESTDIR)$(PREFIX)/include/rocksdb/`dirname $$header`; \ + install -C -m 644 $$header $(DESTDIR)$(PREFIX)/include/rocksdb/$$header; \ done - install -d $(DESTDIR)/$(PREFIX)/include/topling - install -C -m 644 sideplugin/rockside/src/topling/json.h $(DESTDIR)/$(PREFIX)/include/topling - install -C -m 644 sideplugin/rockside/src/topling/json_fwd.h $(DESTDIR)/$(PREFIX)/include/topling - install -C -m 644 sideplugin/rockside/src/topling/builtin_table_factory.h $(DESTDIR)/$(PREFIX)/include/topling - install -C -m 644 sideplugin/rockside/src/topling/side_plugin_repo.h $(DESTDIR)/$(PREFIX)/include/topling - install -C -m 644 sideplugin/rockside/src/topling/side_plugin_factory.h $(DESTDIR)/$(PREFIX)/include/topling - install -d $(DESTDIR)/$(PREFIX)/include/terark - install -d $(DESTDIR)/$(PREFIX)/include/terark/io - install -d $(DESTDIR)/$(PREFIX)/include/terark/succinct - install -d $(DESTDIR)/$(PREFIX)/include/terark/thread - install -d $(DESTDIR)/$(PREFIX)/include/terark/util - install -d $(DESTDIR)/$(PREFIX)/include/terark/fsa - install -d $(DESTDIR)/$(PREFIX)/include/terark/fsa/ppi - install -d $(DESTDIR)/$(PREFIX)/include/terark/zbs - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/*.hpp $(DESTDIR)/$(PREFIX)/include/terark - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/io/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/io - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/succinct/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/succinct - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/thread/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/thread - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/util/*.hpp 
$(DESTDIR)/$(PREFIX)/include/terark/util - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/fsa - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/*.inl $(DESTDIR)/$(PREFIX)/include/terark/fsa - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/ppi/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/fsa/ppi - install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/zbs/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/zbs - cp -ar ${TOPLING_CORE_DIR}/boost-include/boost $(DESTDIR)/$(PREFIX)/include + install -d $(DESTDIR)$(PREFIX)/include/topling + install -C -m 644 sideplugin/rockside/src/topling/json.h $(DESTDIR)$(PREFIX)/include/topling + install -C -m 644 sideplugin/rockside/src/topling/json_fwd.h $(DESTDIR)$(PREFIX)/include/topling + install -C -m 644 sideplugin/rockside/src/topling/builtin_table_factory.h $(DESTDIR)$(PREFIX)/include/topling + install -C -m 644 sideplugin/rockside/src/topling/side_plugin_repo.h $(DESTDIR)$(PREFIX)/include/topling + install -C -m 644 sideplugin/rockside/src/topling/side_plugin_factory.h $(DESTDIR)$(PREFIX)/include/topling + install -d $(DESTDIR)$(PREFIX)/include/terark + install -d $(DESTDIR)$(PREFIX)/include/terark/io + install -d $(DESTDIR)$(PREFIX)/include/terark/succinct + install -d $(DESTDIR)$(PREFIX)/include/terark/thread + install -d $(DESTDIR)$(PREFIX)/include/terark/util + install -d $(DESTDIR)$(PREFIX)/include/terark/fsa + install -d $(DESTDIR)$(PREFIX)/include/terark/fsa/ppi + install -d $(DESTDIR)$(PREFIX)/include/terark/zbs + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/*.hpp $(DESTDIR)$(PREFIX)/include/terark + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/io/*.hpp $(DESTDIR)$(PREFIX)/include/terark/io + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/succinct/*.hpp $(DESTDIR)$(PREFIX)/include/terark/succinct + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/thread/*.hpp $(DESTDIR)$(PREFIX)/include/terark/thread + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/util/*.hpp 
$(DESTDIR)$(PREFIX)/include/terark/util + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/*.hpp $(DESTDIR)$(PREFIX)/include/terark/fsa + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/*.inl $(DESTDIR)$(PREFIX)/include/terark/fsa + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/ppi/*.hpp $(DESTDIR)$(PREFIX)/include/terark/fsa/ppi + install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/zbs/*.hpp $(DESTDIR)$(PREFIX)/include/terark/zbs + cp -ar ${TOPLING_CORE_DIR}/boost-include/boost $(DESTDIR)$(PREFIX)/include + install -d $(DESTDIR)$(PREFIX)/site + install -d $(DESTDIR)$(PREFIX)/toplingdb-conf + install -C -m 644 sideplugin/rockside/src/topling/web/index.html $(DESTDIR)$(PREFIX)/site + install -C -m 644 sideplugin/rockside/src/topling/web/style.css $(DESTDIR)$(PREFIX)/site + install -C -m 644 sideplugin/rockside/sample-conf/db_bench_enterprise.yaml $(DESTDIR)$(PREFIX)/toplingdb-conf install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc install-static: $(LIBRARY) static_lib @@ -2634,6 +2655,9 @@ install-static: $(LIBRARY) static_lib install-shared: $(SHARED4) shared_lib install -d $(INSTALL_LIBDIR) install -C -m 755 $(SHARED4) $(INSTALL_LIBDIR) +ifeq ($(STRIP_DEBUG_INFO),1) + $(STRIP_CMD) $(INSTALL_LIBDIR)/$(SHARED4) +endif ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED3) ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED2) ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED1) @@ -2643,10 +2667,17 @@ install: install-${LIB_MODE} install-dev-static: install-headers install-static install-dev-shared: install-headers install-shared install-dev: install-dev-${LIB_MODE} +upload-trial: ${OBJ_DIR}/sideplugin/topling-zip_table_reader/top_zip_table_builder.o + ossutil cp --region=cn-qingdao -f \ + $(OBJ_DIR)/sideplugin/topling-zip_table_reader/top_zip_table_builder.o \ + oss://topling-tools/${TRIAL_urldir}/ install-dcompact: install dcompact_worker install -d $(DESTDIR)$(PREFIX)/bin install -C -m 755 
sideplugin/topling-dcompact/tools/dcompact/${ORIG_OBJ_DIR}/dcompact_worker.exe $(DESTDIR)$(PREFIX)/bin +ifeq ($(STRIP_DEBUG_INFO),1) + $(STRIP_CMD) $(DESTDIR)$(PREFIX)/bin/dcompact_worker.exe +endif install-tools: install tools mkdir -p $(DESTDIR)$(PREFIX)/bin @@ -3284,10 +3315,18 @@ ${BUILD_ROOT}/lib_static/libterark-zbs-${COMPILER}-${BUILD_TYPE_SIG}.a: +make -C ${TOPLING_CORE_DIR} core fsa zbs ifeq (${WITH_TOPLING_ROCKS},1) -ifneq (,$(wildcard sideplugin/topling-rocks)) +ifneq (,$(wildcard sideplugin/topling-rocks/src/table/top_patent_algo.cc)) sideplugin/topling-rocks/${TOPLING_ROCKS_GIT_VER_SRC}: \ + sideplugin/topling-rocks/Makefile \ $(shell find sideplugin/topling-rocks/{src,tools} -name '*.cc' -o -name '*.h') +make -C sideplugin/topling-rocks ${TOPLING_ROCKS_GIT_VER_SRC} +sideplugin/topling-zip_table_reader/top_zip_table_builder.cc: sideplugin/topling-rocks/${TOPLING_ROCKS_GIT_VER_SRC} +else +${OBJ_DIR}/sideplugin/topling-zip_table_reader/top_zip_table_builder.o: + @mkdir -p $(dir $@) + @cd $(dir $@) && \ + wget https://topling-tools.oss-cn-qingdao.aliyuncs.com/${TRIAL_urldir}/top_zip_table_builder.o || \ + { echo 'Download top_zip_table_builder fail, add WITH_TOPLING_ROCKS=0 to make command and try again' >&2; exit 1; } endif endif diff --git a/README-zh_cn.md b/README-zh_cn.md index 9be6c5e326..267458e2a3 100644 --- a/README-zh_cn.md +++ b/README-zh_cn.md @@ -4,7 +4,7 @@ ToplingDB 由[北京拓扑岭科技有限公司](https://topling.cn)开发与维 ## 快速开始 ToplingDB 需要 C++17,推荐 gcc 8.3 以上,或者 clang 也行。 -ToplingDB 比 RocksDB 快得多,您可以自己快速验证: +ToplingDB 比 RocksDB 快得多,您可以自己快速验证,[下载 ToplingDB 企业版](https://topling-tools.oss-cn-qingdao.aliyuncs.com/toplingdb-8.10-trail90.tgz),或者自己编译: ### Compile & run db_bench ```bash sudo yum -y install git libaio-devel gcc-c++ gflags-devel zlib-devel bzip2-devel libcurl-devel liburing-devel snappy-devel jemalloc-devel @@ -15,7 +15,9 @@ make -j`nproc` db_bench DEBUG_LEVEL=0 sudo make install PREFIX=/some/path # default is /usr/local ``` -以上编译命令执行后,运行 
[db_bench.sh](db_bench.sh)(需要[端口 2011](https://github.com/topling/rockside/blob/master/sample-conf/db_bench_enterprise.yaml#L4 "内嵌的 http web 服务使用端口 2011")),然后使用 ToplingDB:[原生 C++](https://github.com/topling/rockside/wiki/101 "典型场景是从 rocksdb 迁移过来)"),也支持 [Java](https://github.com/topling/rockside/wiki/SidePlugin-Java-Binding "内置在本 github 仓库中") 和 [Rust](https://github.com/topling/rust-toplingdb "另外的专门的 github 仓库")。 +下载解压或者自行编译后,运行 [db_bench.sh](db_bench.sh)(需要[端口 2011](https://github.com/topling/rockside/blob/master/sample-conf/db_bench_enterprise.yaml#L4 "内嵌的 http web 服务使用端口 2011")),然后使用 ToplingDB:[原生 C++](https://github.com/topling/rockside/wiki/101 "典型场景是从 rocksdb 迁移过来)"),也支持 [Java](https://github.com/topling/rockside/wiki/SidePlugin-Java-Binding "内置在本 github 仓库中") 和 [Rust](https://github.com/topling/rust-toplingdb "另外的专门的 github 仓库")。 + +> 自己编译开源版时会自动下载预编译的试用版(90天) ToplingZipTable,如果下载失败,可以给 `make` 传递变量 `WITH_TOPLING_ROCKS=0` 禁用它(或[联系我们](mailto:contact@topling.cn))。 ## 简单介绍 ToplingDB 的子模块 **[rockside](https://github.com/topling/rockside)** 是 ToplingDB 的入口,详情参考 **[SidePlugin wiki](https://github.com/topling/rockside/wiki)**。 @@ -84,12 +86,6 @@ toplingdb ``` make -j`nproc` EXTRA_CXXFLAGS='-DROCKSDB_DYNAMIC_CREATE_CF' rocksdbjava ``` -## License -为了兼容开源协议,下列原先禁止字节跳动使用本软件的条款从 2023-04-24 起已被删除,也就是说,字节跳动使用 ToplingDB 的行为不再是非法的,也不是无耻的。 - -~~我们禁止字节跳动使用本软件,其它条款与上游 RocksDB 完全相同,~~ 详情参考 [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING), [LICENSE.leveldb](LICENSE.leveldb). - -相应 LICENSE 文件中禁止字节跳动使用本软件的条款也已经删除:[LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING), [LICENSE.leveldb](LICENSE.leveldb).
以下是上游 RocksDB 的原版 README diff --git a/README.md b/README.md index f1220135e1..5615b66384 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ToplingDB is developed and maintained by [Topling Inc](https://topling.cn). See ## Quick Start ToplingDB requires C++17, gcc 8.3 or newer is recommended, clang also works. -ToplingDB is forked form [RocksDB](https://github.com/facebook/rocksdb), much faster than RocksDB, try it by yourself: +ToplingDB is forked from [RocksDB](https://github.com/facebook/rocksdb), much faster than RocksDB, you can [download ToplingDB Enterprise](https://topling-tools.oss-cn-qingdao.aliyuncs.com/toplingdb-8.10-trail90.tgz) or compile it yourself: ### Compile & run db_bench ```bash sudo yum -y install git libaio-devel gcc-c++ gflags-devel zlib-devel bzip2-devel libcurl-devel liburing-devel snappy-devel jemalloc-devel @@ -16,7 +16,9 @@ make -j`nproc` db_bench DEBUG_LEVEL=0 sudo make install PREFIX=/some/path # default is /usr/local ``` -After compile, you can run bundled [db_bench.sh](db_bench.sh)(need [port 2011](https://github.com/topling/rockside/blob/master/sample-conf/db_bench_enterprise.yaml#L4 "use port 2011 for embeded http server")), then use ToplingDB [in C++](https://github.com/topling/sideplugin-wiki-en/wiki/101 "maybe migrate from rocksdb"), or in [Java](https://github.com/topling/sideplugin-wiki-en/wiki/SidePlugin-Java-Binding "Bundled in this repo"), [Rust](https://github.com/topling/rust-toplingdb "A seperated repo"). 
+After downloading and uncompressing, or compiling, you can run the bundled [db_bench.sh](db_bench.sh) (needs [port 2011](https://github.com/topling/rockside/blob/master/sample-conf/db_bench_enterprise.yaml#L4 "use port 2011 for embedded http server")), then use ToplingDB [in C++](https://github.com/topling/sideplugin-wiki-en/wiki/101 "maybe migrate from rocksdb"), or in [Java](https://github.com/topling/sideplugin-wiki-en/wiki/SidePlugin-Java-Binding "Bundled in this repo"), [Rust](https://github.com/topling/rust-toplingdb "A separate repo"). + +> During compilation, a precompiled ToplingZipTable (90-day trial) is downloaded automatically; if the download fails, you can pass `WITH_TOPLING_ROCKS=0` to `make` to disable it (or [contact us](mailto:contact@topling.cn)). ## Introduction ToplingDB's submodule **[rockside](https://github.com/topling/rockside)** is the entry point of ToplingDB, see **[SidePlugin wiki](https://github.com/topling/sideplugin-wiki-en/wiki)**. @@ -88,16 +90,6 @@ To enable these features, add `-D${MACRO_NAME}` to var `EXTRA_CXXFLAGS`, such as ``` make -j`nproc` EXTRA_CXXFLAGS='-DROCKSDB_DYNAMIC_CREATE_CF' rocksdbjava ``` -## License -To conform open source license, the following term of disallowing bytedance is deleted since 2023-04-24, -that is say: bytedance using ToplingDB is no longer illeagal and is not a shame. - -~~We disallow bytedance using this software, other terms are identidal with -upstream rocksdb license,~~ see [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING) and -[LICENSE.leveldb](LICENSE.leveldb). - -The terms of disallowing bytedance are also deleted in [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING) and -[LICENSE.leveldb](LICENSE.leveldb).

diff --git a/btest-trial.sh b/btest-trial.sh new file mode 100644 index 0000000000..9f91f34c3c --- /dev/null +++ b/btest-trial.sh @@ -0,0 +1,9 @@ +#!/usr/bin/bash + +if [ -e sideplugin/topling-rocks-bak ]; then + echo sideplugin/topling-rocks-bak exists >&2 + exit 1 +fi +mv sideplugin/{topling-rocks,topling-rocks-bak} +make WITH_TOPLING_ROCKS=1 "$@"; rc=$? # quote args so values with spaces survive; remember make's status +mv sideplugin/{topling-rocks-bak,topling-rocks} && exit $rc # always restore the repo, then report make's result diff --git a/build-min-dep-jni.sh b/build-min-dep-jni.sh new file mode 100644 index 0000000000..d94fad8d92 --- /dev/null +++ b/build-min-dep-jni.sh @@ -0,0 +1,63 @@ +#!/usr/bin/bash + +ROCKSDB_VERSION=`build_tools/version.sh full` + +if [ -z "${TOPLING_VERSION}" ]; then + GITHUB_REF=`git describe --tags --exact-match || git branch --show-current` + # GITHUB_REF seems like: topling-8.10.2-frocksdb-1.0, part will be ignored + TOPLING_VERSION=`echo ${GITHUB_REF} | sed -n 's:^topling-'${ROCKSDB_VERSION}'[-_a-z]*\([.0-9]\):\1:p'` + if [ -z "${TOPLING_VERSION}" ]; then + echo TOPLING_VERSION is not set and can not parse from HEAD ref >&2 + exit 1 + fi +fi +#ROCKSDB_JAVA_VERSION=${ROCKSDB_VERSION}-topling-${TOPLING_VERSION}-trial${TOPLING_ZIP_TABLE_TRIAL_DAYS} +ROCKSDB_JAVA_VERSION=${ROCKSDB_VERSION}-topling-${TOPLING_VERSION} + +export USE_LTO=1 +export UPDATE_REPO=0 +export DEBUG_LEVEL=0 +export DISABLE_JEMALLOC=1 +export ROCKSDB_DISABLE_GFLAGS=1 +export TOPLING_USE_DYNAMIC_TLS=1 +export TOPLING_ZIP_TABLE_TRIAL_DAYS=90 +MAJOR_DOT_MINOR=`build_tools/version.sh major`.`build_tools/version.sh minor` + +rm -rf java/include +rm -rf snappy* lz4* bzip2* +rm -f libsnappy.a liblz4.a libbz2.a +make -j60 libsnappy.a liblz4.a libbz2.a +make rocksdbjava install-dcompact -j`nproc` BUILD_PREFIX=min-dep-jni/ \ + PREFIX=min-dep-jni STRIP_DEBUG_INFO=1 ROCKSDB_JAR_WITH_DYNAMIC_LIBS=1 + +patchelf --replace-needed librocksdb.so.${MAJOR_DOT_MINOR} librocksdbjni-linux64.so min-dep-jni/bin/dcompact_worker.exe + +cd java/target +db_artifactId=`sed -n 's/.*\(f\?rocksdbjni\)<\/artifactId>.*/\1/p' 
../pom.xml.template` +TARGET_JAR=${db_artifactId}-${ROCKSDB_JAVA_VERSION}.jar +mv rocksdbjni-${ROCKSDB_VERSION}-linux64.jar ${TARGET_JAR} +rm *.sha1 +( # in sub shell + cd ../../min-dep-jni/bin + jar -uf ../../java/target/${TARGET_JAR} dcompact_worker.exe +) +shasum -a 1 ${TARGET_JAR} > ${TARGET_JAR}.sha1 +md5sum ${TARGET_JAR} > ${TARGET_JAR}.md5 + +source /etc/os-release +if [ "${ID}" = "centos" ]; then + ospart=/${ID}${VERSION_ID} +fi +#ospart # e.g. "/centos7" +dir=toplingdb${ospart}/cn/topling/${db_artifactId}/${ROCKSDB_JAVA_VERSION} +for file in ${TARGET_JAR}{,.sha1,.md5} ; do + ossutil cp --region=cn-qingdao -f $file oss://topling-tools/${dir}/ +done +set +x +echo =========================================== +echo ======== Download URL +echo =========================================== +echo https://topling-tools.oss-cn-qingdao.aliyuncs.com/${dir}/${TARGET_JAR} +echo https://topling-tools.oss-cn-qingdao.aliyuncs.com/${dir}/${TARGET_JAR}.sha1 +echo https://topling-tools.oss-cn-qingdao.aliyuncs.com/${dir}/${TARGET_JAR}.md5 +echo =========================================== diff --git a/build-min-dep-release.sh b/build-min-dep-release.sh new file mode 100644 index 0000000000..8bb12c2501 --- /dev/null +++ b/build-min-dep-release.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +export USE_LTO=1 +export ROCKSDB_DISABLE_GFLAGS=1 +export TOPLING_ZIP_TABLE_TRIAL_DAYS=90 +make UPDATE_REPO=0 DEBUG_LEVEL=0 DISABLE_JEMALLOC=1 TOPLING_USE_DYNAMIC_TLS=1 -j60 libsnappy.a liblz4.a libbz2.a +make rocksdbjava install-dcompact -j`nproc` \ + DEBUG_LEVEL=0 UPDATE_REPO=0 PREFIX=install-here \ + DISABLE_JEMALLOC=1 TOPLING_USE_DYNAMIC_TLS=1 \ + STRIP_DEBUG_INFO=1 ROCKSDB_JAR_WITH_DYNAMIC_LIBS=1 + diff --git a/build-trial.sh b/build-trial.sh new file mode 100644 index 0000000000..cbd27cc466 --- /dev/null +++ b/build-trial.sh @@ -0,0 +1,42 @@ +#!/usr/bin/bash + +export UPDATE_REPO=0 +#export ROCKSDB_DISABLE_GFLAGS=1 +export TOPLING_ZIP_TABLE_TRIAL_DAYS=90 +MAJOR_DOT_MINOR=`build_tools/version.sh 
major`.`build_tools/version.sh minor` + +make -j60 libsnappy.a liblz4.a libbz2.a BUILD_PREFIX=bconf-0/ +GetDebugLevel=(2 0) +for ((i=0;i<16;i++)); do + export DEBUG_LEVEL=${GetDebugLevel[$((i/1%2))]} + export USE_LTO=$((i/2%2)) + export DISABLE_JEMALLOC=$((i/4%2)) + export TOPLING_USE_DYNAMIC_TLS=$((i/8%2)) + make -j`nproc` upload-trial BUILD_PREFIX=bconf-${i}/ +done +# The last bconf-15 is release build which: +# DEBUG_LEVEL=0,USE_LTO=1,DISABLE_JEMALLOC=1,TOPLING_USE_DYNAMIC_TLS=1 +export BUILD_PREFIX=bconf-15/ +rm -rf toplingdb-${MAJOR_DOT_MINOR} +rm -rf librocksdb* db_bench +rm -rf sideplugin/topling-dcompact/tools/dcompact/build +make install-dcompact install-dev db_bench -j`nproc` \ + PREFIX=toplingdb-${MAJOR_DOT_MINOR} STRIP_DEBUG_INFO=1 + +install -C -m 755 db_bench toplingdb-${MAJOR_DOT_MINOR}/bin +install -C -m 755 db_bench.sh toplingdb-${MAJOR_DOT_MINOR} +strip toplingdb-${MAJOR_DOT_MINOR}/bin/db_bench +sed -e 's:sideplugin/rockside/src/topling/web:site:' \ + -e 's:sideplugin/rockside/sample-conf:toplingdb-conf:' \ + -e 's:\./db_bench:bin/db_bench:' \ + -e '/ulimit/iexport LD_LIBRARY_PATH=lib:$LD_LIBRARY_PATH' \ + -i toplingdb-${MAJOR_DOT_MINOR}/db_bench.sh +source /etc/os-release +if [ "${ID}" = "centos" ]; then + ospart=-${ID}${VERSION_ID} # e.g. 
"-centos7" +else + ospart="" # keep empty +fi +sdk=toplingdb-${MAJOR_DOT_MINOR}-trail${TOPLING_ZIP_TABLE_TRIAL_DAYS}${ospart}.tgz +tar czf ${sdk} toplingdb-${MAJOR_DOT_MINOR} +ossutil cp --region=cn-qingdao -f ${sdk} oss://topling-tools/ diff --git a/compile.sh b/compile.sh new file mode 100644 index 0000000000..b50a735c9f --- /dev/null +++ b/compile.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +export SANDCASTLE=1 +export BUILD_PREFIX=../build-toplingdb/ +export PATH=/usr/local/bin:$PATH +export CXX=clang++ +export CC=clang + +ROCKSDB_VERSION=`build_tools/version.sh full` +TOPLING_CORE_DIR=sideplugin/topling-zip +COMPILER=`bash ${TOPLING_CORE_DIR}/get-compiler-name.sh` +WITH_BMI2=`bash ${TOPLING_CORE_DIR}/cpu_has_bmi2.sh` +UNAME_MachineSystem=`uname -m -s | sed 's:[ /]:-:g'` +BUILD_NAME=${UNAME_MachineSystem}-${COMPILER}-bmi2-${WITH_BMI2} +BUILD_ROOT=build/${BUILD_NAME} + +dir=${BUILD_PREFIX}build/${BUILD_NAME}/dbg/v${ROCKSDB_VERSION} +dir_ut=${BUILD_PREFIX}build-ut/${BUILD_NAME}/dbg/v${ROCKSDB_VERSION} +DEBUG_LEVEL=2 +function map() { + if [[ $1 == *test*.o ]]; then + echo $dir_ut/$1 + elif [[ $1 == *.o ]]; then + echo $dir/$1 + else + echo $1 + fi +} +targets=(`for i in $@;do map $i;done`) + +make PREFIX=/opt UPDATE_REPO=0 -j`nproc` ${targets[@]} diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index 1c0caba93d..617b7939fc 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -942,9 +942,11 @@ TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) { "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) { KeyContext* const key_context = static_cast(arg); assert(key_context); + #if defined(TOPLINGDB_WITH_TIMESTAMP) assert(key_context->key); + #endif - if (*(key_context->key) == key) { + if (key_context->ukey_without_ts == key) { Slice* const blob_index = key_context->value; assert(blob_index); assert(!blob_index->empty()); diff --git a/db/c.cc b/db/c.cc index 407316f7ca..d62bfbfd71 100644 --- a/db/c.cc +++ b/db/c.cc @@ 
-50,7 +50,9 @@ #include "rocksdb/write_buffer_manager.h" #include "util/stderr_logger.h" #include "utilities/merge_operators.h" -#include "topling/side_plugin_repo.h" +#include "topling/side_plugin_factory.h" +#include "db/compaction/compaction_executor.h" +#include "logging/logging.h" using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupEngineOptions; @@ -4544,6 +4546,10 @@ void rocksdb_readoptions_finish_pin(rocksdb_readoptions_t* opt) { return opt->rep.FinishPin(); } +unsigned char rocksdb_readoptions_is_in_pinning_section(rocksdb_readoptions_t* opt) { + return opt->rep.internal_is_in_pinning_section ? 1 : 0; +} + void rocksdb_readoptions_set_async_queue_depth(rocksdb_readoptions_t* opt, size_t v) { v = std::min(v, (size_t)1024); opt->rep.async_queue_depth = v; @@ -6754,6 +6760,16 @@ const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v, return v->rep.data(); } +struct rocksdb_stdstr_t { + std::string rep; +}; + +rocksdb_stdstr_t* rocksdb_stdstr_create(const char* str, size_t len) { + return new rocksdb_stdstr_t{std::string(str, len)}; +} + +void rocksdb_stdstr_destroy(rocksdb_stdstr_t* v) { delete v; } + // container to keep databases and caches in order to use // ROCKSDB_NAMESPACE::MemoryUtil struct rocksdb_memory_consumers_t { @@ -7005,6 +7021,12 @@ void side_plugin_repo_import_auto_file(side_plugin_repo_t* r, SaveError(errptr, s); } +void side_plugin_repo_import(side_plugin_repo_t* r, + const char* json_str, char** errptr) { + auto s = r->repo.Import(std::string(json_str)); + SaveError(errptr, s); +} + rocksdb_t* side_plugin_repo_open(side_plugin_repo_t* r, rocksdb_column_family_handle_t*** p_cfhs, size_t* num_cf, char** errptr) { if (p_cfhs) { // Open with column families @@ -7070,13 +7092,280 @@ void side_plugin_repo_put_cf_options(side_plugin_repo_t* r, const char* name, r->repo.Put(name, std::make_shared(opt->rep)); } +bool side_plugin_db_options_update_from(rocksdb_options_t* opt, const side_plugin_repo_t* r, const 
char* name) { + return r->repo.DBOptionsUpdateFrom(&opt->rep, name); +} + +bool side_plugin_cf_options_update_from(rocksdb_options_t* opt, const side_plugin_repo_t* r, const char* name) { + return r->repo.CFOptionsUpdateFrom(&opt->rep, name); +} + void side_plugin_repo_close_all(side_plugin_repo_t* r) { r->repo.CloseAllDB(false); // also close http delete r; } +void side_plugin_repo_forget_db(side_plugin_repo_t* r, rocksdb_t* db) { + r->repo.CloseOneDB(db->rep, false); +} + const char* rocksdb_get_name(rocksdb_t* p) { return p->rep->GetName().c_str(); } +} // end extern "C" + +#define DoPrintLog(...) \ + info_log ? ROCKS_LOG_INFO(info_log, __VA_ARGS__) \ + : (void)fprintf(stderr, __VA_ARGS__) +#define PrintLog(level, fmt, ...) \ + do { if (SidePluginRepo::DebugLevel() >= level) \ + DoPrintLog("%s: " fmt "\n", \ + TERARK_PP_SmartForPrintf(rocksdb::StrDateTimeNow(), ## __VA_ARGS__)); \ + } while (0) +#define TRAC(...) PrintLog(4, "TRAC: " __VA_ARGS__) +#define DEBG(...) PrintLog(3, "DEBG: " __VA_ARGS__) +#define INFO(...) PrintLog(2, "INFO: " __VA_ARGS__) +#define WARN(...) PrintLog(1, "WARN: " __VA_ARGS__) + +namespace ROCKSDB_NAMESPACE { + +template +static void* get_ffi_obj(const FFI_BridgeObject* bridge) { + // existing rocksdb bridge class consitently name ffi_obj as state_. + // bridge itself is always const. 
+ return bridge->state_; +} + +using terark::llong; +template +struct FFI_SerDe : public DcompactSerDeFunc { + virtual void SerializeRequest(FILE* fp, const Object& obj) const final { + ROCKSDB_VERIFY(!IsCompactionWorker()); // phase 1, DB Side + auto bridge = dynamic_cast(&obj); + ROCKSDB_VERIFY(nullptr != bridge); + DEBG("job-%05d cf-%d %s::SerializeRequest: job raw = %.3f GB, zip = %.3f GB, smallest_seqno = %lld", + job_id, m_cp->cf_id, m_name, rawzip[0]/1e9, rawzip[1]/1e9, (llong)m_cp->smallest_seqno); + ffi_vtab.serialize_request(fp, get_ffi_obj(bridge)); + } + virtual void DeSerializeRequest(FILE* fp, Object* obj) const final { + ROCKSDB_VERIFY(IsCompactionWorker()); // phase 2, compact worker side + DEBG("job-%05d cf-%d %s::DeSerializeRequest: job raw = %.3f GB, zip = %.3f GB, smallest_seqno = %lld", + job_id, m_cp->cf_id, m_name, rawzip[0]/1e9, rawzip[1]/1e9, (llong)m_cp->smallest_seqno); + auto bridge = dynamic_cast(obj); + ROCKSDB_VERIFY(nullptr != bridge); + ffi_vtab.deserialize_request(fp, get_ffi_obj(bridge)); + } + virtual void SerializeResponse(FILE* fp, const Object& obj) const final { + ROCKSDB_VERIFY(IsCompactionWorker()); // phase 3, compact worker side + auto bridge = dynamic_cast(&obj); + ROCKSDB_VERIFY(nullptr != bridge); + ffi_vtab.serialize_response(fp, get_ffi_obj(bridge)); + } + virtual void DeSerializeResponse(FILE* fp, Object* obj) const final { + ROCKSDB_VERIFY(!IsCompactionWorker()); // phase 4, DB side + auto bridge = dynamic_cast(obj); + ROCKSDB_VERIFY(nullptr != bridge); + ffi_vtab.deserialize_response(fp, get_ffi_obj(bridge)); + } + FFI_SerDe(const json& js, const SidePluginRepo& repo, + const std::string& name, const side_plugin_ex_vtab_t& vtab) + : m_name(name) + { + ffi_vtab = vtab; + auto cp = m_cp = JS_CompactionParamsDecodePtr(js); + info_log = cp->info_log; + const auto& smallest_user_key = Slice(cp->smallest_user_key).ToString(true/*hex*/); + const auto& largest_user_key = Slice(cp->largest_user_key).ToString(true/*hex*/); 
+ job_id = cp->job_id; + cp->InputBytes(rawzip); + TRAC("job-%05d cf-%d %s::FFI_SerDe: smallest_user_key = %s, largest_user_key = %s, job raw = %.3f GB, zip = %.3f GB", + cp->job_id, cp->cf_id, name, smallest_user_key.c_str(), largest_user_key.c_str(), rawzip[0]/1e9, rawzip[1]/1e9); + } + std::string m_name; + side_plugin_ex_vtab_t ffi_vtab; + const CompactionParams* m_cp; + rocksdb::Logger* info_log; + int job_id; + size_t rawzip[2]; +}; + +template +struct FFI_WebManip : public PluginManipFunc { + virtual void Update(Object* obj, const json& query, const json& body, + const SidePluginRepo& repo) const { + std::string str_qry = query.dump(); + std::string str_body = body.dump(); + auto bridge = dynamic_cast(obj); + ROCKSDB_VERIFY(nullptr != bridge); + auto ffi_repo = (const side_plugin_repo_t*)(&repo); + if (m_ffi_vtab.web_update) { + m_ffi_vtab.web_update(get_ffi_obj(bridge), str_qry.c_str(), str_body.c_str(), ffi_repo); + } + } + virtual std::string ToString(const Object& obj, const json& query, + const SidePluginRepo& repo) const { + std::string str_qry = query.dump(); + auto bridge = dynamic_cast(&obj); + ROCKSDB_VERIFY(nullptr != bridge); + auto ffi_repo = (const side_plugin_repo_t*)(&repo); + rocksdb_stdstr_t* result = m_ffi_vtab.web_view(get_ffi_obj(bridge), str_qry.c_str(), ffi_repo); + TERARK_VERIFY(nullptr != result); + TERARK_SCOPE_EXIT(rocksdb_stdstr_destroy(result)); + return std::move(result->rep); + } + FFI_WebManip(const side_plugin_ex_vtab_t& ffi_vtab) : m_ffi_vtab(ffi_vtab) {} + side_plugin_ex_vtab_t m_ffi_vtab; +}; + +template +static void side_plugin_register_ex(const char* name, const side_plugin_ex_vtab_t* ex_vtab) { + if (ex_vtab->serialize_request) { + ROCKSDB_VERIFY(nullptr != ex_vtab->deserialize_request ); + ROCKSDB_VERIFY(nullptr != ex_vtab-> serialize_response); + ROCKSDB_VERIFY(nullptr != ex_vtab->deserialize_response); + using NoConstObj = std::remove_const_t; + auto cxx_creator = [name=std::string(name), cp_vtab=*ex_vtab] + (const 
json& js, const SidePluginRepo& repo) -> std::shared_ptr > { + static_assert(offsetof(side_plugin_repo_t, repo) == 0); + return std::make_shared >(js, repo, name, cp_vtab); + }; + SerDeFactory::DoReg(name, cxx_creator, __FILE__, __LINE__); + } + if (ex_vtab->web_view) { + auto cxx_creator = [ + singleton=std::make_shared >(*ex_vtab) + ](const json&, const SidePluginRepo&) -> const PluginManipFunc* { + static_assert(offsetof(side_plugin_repo_t, repo) == 0); + return singleton.get(); + }; + PluginManip::DoReg(name, cxx_creator, __FILE__, __LINE__); + } +} + +} // ROCKSDB_NAMESPACE + +using ROCKSDB_NAMESPACE::SidePluginRepo; +using ROCKSDB_NAMESPACE::PluginFactory; +using ROCKSDB_NAMESPACE::json; +using ROCKSDB_NAMESPACE::side_plugin_register_ex; +using ROCKSDB_NAMESPACE::SerDeFactory; +using ROCKSDB_NAMESPACE::PluginManip; + +template +static void side_plugin_unregister_ex(const char* name) { + using NoConstObj = std::remove_const_t; + SerDeFactory::UnReg(name); + PluginManip::UnReg(name); +} + +template +static void side_plugin_register_raw_ptr_plugin +(const char* name, FFI_BridgeObject*(*creator)(const char* strjson, const side_plugin_repo_t*), + const side_plugin_ex_vtab_t* ex_vtab) +{ + auto cxx_creator = [creator](const json& js, const SidePluginRepo& repo) { + std::string strjson = js.dump(); + static_assert(offsetof(side_plugin_repo_t, repo) == 0); + Object* ptr = creator(strjson.c_str(), (const side_plugin_repo_t*)(&repo)); + return ptr; + }; + PluginFactory::DoReg(name, cxx_creator, __FILE__, __LINE__); + if (ex_vtab) { + side_plugin_register_ex(name, ex_vtab); + } +} + +template +static void side_plugin_register_shared_ptr_plugin +(const char* name, FFI_BridgeObject*(*creator)(const char* strjson, const side_plugin_repo_t*), + const side_plugin_ex_vtab_t* ex_vtab) +{ + auto cxx_creator = [creator](const json& js, const SidePluginRepo& repo) { + std::string strjson = js.dump(); + static_assert(offsetof(side_plugin_repo_t, repo) == 0); + Object* ptr = 
creator(strjson.c_str(), (const side_plugin_repo_t*)(&repo)); + return std::shared_ptr(ptr); + }; + PluginFactory >::DoReg(name, cxx_creator, __FILE__, __LINE__); + if (ex_vtab) { + side_plugin_register_ex(name, ex_vtab); + } +} + +extern "C" { + +void side_plugin_register_comparator +(const char* name, rocksdb_comparator_creator_t creator, const side_plugin_ex_vtab_t* ex_vtab) { + side_plugin_register_raw_ptr_plugin(name, creator, ex_vtab); +} +void side_plugin_unregister_comparator(const char* name) { + PluginFactory::UnReg(name); + side_plugin_unregister_ex(name); +} +void* side_plugin_comparator_get_state(const rocksdb_comparator_t* p) { + return p->state_; +} + +void side_plugin_register_compaction_filter_factory +(const char* name, rocksdb_compactionfilterfactory_creator_t creator, const side_plugin_ex_vtab_t* ex_vtab) { + side_plugin_register_shared_ptr_plugin(name, creator, ex_vtab); +} +void side_plugin_unregister_compaction_filter_factory(const char* name) { + PluginFactory >::UnReg(name); + side_plugin_unregister_ex(name); +} +void* side_plugin_compactionfilterfactory_get_state(const rocksdb_compactionfilterfactory_t* p) { + return p->state_; +} + +void side_plugin_register_merge_operator +(const char* name, rocksdb_mergeoperator_creator_t creator, const side_plugin_ex_vtab_t* ex_vtab) { + side_plugin_register_shared_ptr_plugin(name, creator, ex_vtab); +} +void side_plugin_unregister_merge_operator(const char* name) { + PluginFactory >::UnReg(name); + side_plugin_unregister_ex(name); +} +void* side_plugin_mergeoperator_get_state(const rocksdb_mergeoperator_t* p) { + return p->state_; +} + +void side_plugin_register_slicetransform +(const char* name, rocksdb_slicetransform_creator_t creator, const side_plugin_ex_vtab_t* ex_vtab) { + side_plugin_register_shared_ptr_plugin(name, creator, ex_vtab); +} +void side_plugin_unregister_slicetransform(const char* name) { + PluginFactory >::UnReg(name); + side_plugin_unregister_ex(name); +} +void* 
side_plugin_slicetransform_get_state(const rocksdb_slicetransform_t* p) { + return p->state_; +} + +void side_plugin_register_filterpolicy +(const char* name, rocksdb_filterpolicy_creator_t creator, const side_plugin_ex_vtab_t* ex_vtab) { + side_plugin_register_shared_ptr_plugin(name, creator, ex_vtab); +} +void side_plugin_unregister_filterpolicy(const char* name) { + PluginFactory >::UnReg(name); + side_plugin_unregister_ex(name); +} +void* side_plugin_filterpolicy_get_state(const rocksdb_filterpolicy_t* p) { + return p->state_; +} + +#if 0 // rocksdb c api does not support custom rate limiter +void side_plugin_register_ratelimiter +(const char* name, rocksdb_ratelimiter_creator_t creator, const side_plugin_ex_vtab_t* ex_vtab) { + side_plugin_register_shared_ptr_plugin(name, creator, ex_vtab); +} +void side_plugin_unregister_ratelimiter(const char* name) { + PluginFactory >::UnReg(name); +} +void* side_plugin_ratelimiter_get_state(const rocksdb_ratelimiter_creator_t* p) { + return p->state_; +} +#endif + } // end extern "C" diff --git a/db/compaction/compaction_executor.cc b/db/compaction/compaction_executor.cc index c4a1888704..534abd5b3a 100644 --- a/db/compaction/compaction_executor.cc +++ b/db/compaction/compaction_executor.cc @@ -278,19 +278,6 @@ std::string ReplacePrefix(Slice Old, Slice New, Slice str) { int(str.size()), str.data(), int(Old.size()), Old.data()); } -void ReplaceAll(std::string& str, Slice from, Slice to) { - if (from.empty()) return; - size_t start_pos = 0; - while ((start_pos = str.find(from.data(), start_pos)) != std::string::npos) { - str.replace(start_pos, from.size(), to.data(), to.size()); - start_pos += to.size(); - } -} -std::string ReplaceAll(Slice str, Slice from, Slice to) { - std::string tmp(str.data(), str.size()); - ReplaceAll(tmp, from, to); - return tmp; -} std::string MakePath(std::string dir, Slice sub) { while (!dir.empty() && '/' == dir.back()) { dir.pop_back(); diff --git a/db/compaction/compaction_executor.h 
b/db/compaction/compaction_executor.h index 477d369ec0..5fd4cb29a0 100644 --- a/db/compaction/compaction_executor.h +++ b/db/compaction/compaction_executor.h @@ -182,8 +182,6 @@ class CompactionExecutorFactory { std::string GetDirFromEnv(const char* name, const char* Default = nullptr); bool ReplacePrefix(Slice Old, Slice New, Slice str, std::string* res); std::string ReplacePrefix(Slice Old, Slice New, Slice str); -void ReplaceAll(std::string& str, Slice from, Slice to); -std::string ReplaceAll(Slice str, Slice from, Slice to); std::string MakePath(std::string dir, Slice sub); std::string& AppendJobID(std::string& path, int job_id); std::string CatJobID(const std::string& path, int job_id); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 097163c60d..57ce84a663 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -151,16 +151,20 @@ struct ToplingMGetCtx : protected MergeContext { #if defined(TOPLINGDB_WITH_TIMESTAMP) std::string* timestamp = nullptr; -#endif union { LookupKey lkey; }; +#endif + union { + ParsedInternalKey pikey; + }; void InitLookupKey(const Slice& user_key, SequenceNumber seq, const Slice* ts) { #if defined(TOPLINGDB_WITH_TIMESTAMP) new(&lkey)LookupKey(user_key, seq, ts); + new(&pikey)ParsedInternalKey(lkey.internal_key()); #else - new(&lkey)LookupKey(user_key, seq); + new(&pikey)ParsedInternalKey(user_key, seq, kValueTypeForSeek); (void)ts; assert(ts == nullptr); #endif @@ -168,8 +172,10 @@ struct ToplingMGetCtx : protected MergeContext { } ToplingMGetCtx() {} ~ToplingMGetCtx() { +#if defined(TOPLINGDB_WITH_TIMESTAMP) if (this->ext_flags_ & FLAG_lkey_initialized) lkey.~LookupKey(); +#endif } void set_done() { this->ext_flags_ |= FLAG_done; } bool is_done() const { return (this->ext_flags_ & FLAG_done) != 0; } @@ -828,6 +834,8 @@ Status DBImpl::CloseHelper() { Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { + MaybeForgetDB(this); + // TODO: remove this. 
init_logger_creation_s_.PermitUncheckedError(); @@ -2442,7 +2450,7 @@ Status DBImpl::GetInst(const ReadOptions& read_options, const Slice& key, #if defined(TOPLINGDB_WITH_TIMESTAMP) LookupKey lkey(key, snapshot, read_options.timestamp); #else - LookupKey lkey(key, snapshot); + ParsedInternalKey lkey(key, snapshot, kValueTypeForSeek); #endif PERF_TIMER_STOP(get_snapshot_time); @@ -2511,7 +2519,7 @@ Status DBImpl::GetInst(const ReadOptions& read_options, const Slice& key, PinnedIteratorsManager pinned_iters_mgr; if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get( + sv->current->template Get( read_options, lkey, get_impl_options.value, get_impl_options.columns, timestamp, &s, &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, @@ -2769,7 +2777,7 @@ std::vector DBImpl::MultiGet( LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp); #else std::string* timestamp = nullptr; - LookupKey lkey(keys[keys_read], consistent_seqnum); + ParsedInternalKey lkey(keys[keys_read], consistent_seqnum, kValueTypeForSeek); #endif auto cfh = static_cast_with_check( @@ -3241,7 +3249,7 @@ struct CompareKeyContext { // Both keys are from the same column family int cmp = comparator->CompareWithoutTimestamp( - *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false); + lhs->ukey_without_ts, /*a_has_ts=*/false, rhs->ukey_without_ts, /*b_has_ts=*/false); if (cmp < 0) { return true; } @@ -3253,7 +3261,7 @@ struct CompareKeyContextSameCF { const Comparator* comparator; inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) { int cmp = comparator->CompareWithoutTimestamp( - *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false); + lhs->ukey_without_ts, /*a_has_ts=*/false, rhs->ukey_without_ts, /*b_has_ts=*/false); return cmp < 0; } }; @@ -3498,7 +3506,7 @@ if (UNLIKELY(!g_MultiGetUseFiber)) { auto& max_covering_tombstone_seq = ctx_vec[i].max_covering_tombstone_seq; MergeContext& merge_context = 
ctx_vec[i].merge_context(); Status& s = statuses[i]; - if (sv->mem->Get(ctx_vec[i].lkey, &values[i], columns, + if (sv->mem->Get(ctx_vec[i].pikey, &values[i], columns, timestamp, &s, &merge_context, &max_covering_tombstone_seq, read_options, false, // immutable_memtable @@ -3506,7 +3514,7 @@ if (UNLIKELY(!g_MultiGetUseFiber)) { ctx_vec[i].set_done(); hits++; } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(ctx_vec[i].lkey, &values[i], columns, + sv->imm->Get(ctx_vec[i].pikey, &values[i], columns, timestamp, &s, &merge_context, &max_covering_tombstone_seq, read_options, callback, is_blob_index)) { @@ -3527,7 +3535,7 @@ if (UNLIKELY(!g_MultiGetUseFiber)) { bool* value_found = nullptr; bool get_value = true; sv->current->Get( - read_options, ctx_vec[i].lkey, &values[i], columns, + read_options, ctx_vec[i].pikey, &values[i], columns, timestamp, &statuses[i], &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, value_found, @@ -3537,13 +3545,14 @@ if (UNLIKELY(!g_MultiGetUseFiber)) { get_value); counting++; }; - if (read_options.async_io) { + const bool async_io = read_options.async_io; + if (async_io) { gt_fiber_pool.update_fiber_count(read_options.async_queue_depth); } size_t memtab_miss = 0; for (size_t i = 0; i < num_keys; i++) { if (!ctx_vec[i].is_done()) { - if (read_options.async_io) { + if (async_io) { gt_fiber_pool.push({TERARK_C_CALLBACK(get_in_sst), i}); } else { get_in_sst(i); @@ -3578,6 +3587,10 @@ if (UNLIKELY(!g_MultiGetUseFiber)) { if (!read_options.internal_is_in_pinning_section) ReturnAndCleanupSuperVersion(cfd, sv); +#else + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::NotSupported("macro TOPLINGDB_WITH_FIBER_AIO is 0 but env MultiGetUseFiber is true"); + } #endif // TOPLINGDB_WITH_FIBER_AIO } // g_MultiGetUseFiber } @@ -4008,6 +4021,10 @@ void DB_UpdateMaxColumnFamily(DB* db, uint32_t max_cf_id) { cfset->UpdateMaxColumnFamily(max_cf_id); } +ColumnFamilyHandle* DB_persist_stats_cf_handle(const DB* db) { + 
return static_cast_with_check(db)->persist_stats_cf_handle(); +} + Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { @@ -4017,6 +4034,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, Status s; *handle = nullptr; + MaybeCFOptionsUpdateFrom(const_cast(&cf_options), + column_family_name, dbname_); + ROCKSDB_SCOPE_EXIT(MaybeRetainCF(this, *handle)); DBOptions db_options = BuildDBOptions(immutable_db_options_, mutable_db_options_); s = ColumnFamilyData::ValidateOptions(db_options, cf_options); @@ -4142,6 +4162,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { return Status::InvalidArgument("Can't drop default column family"); } + MaybeForgetCF(this, column_family); + bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); VersionEdit edit; @@ -5140,7 +5162,7 @@ ReadOptions::~ReadOptions() { SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd, const ReadOptions* ro) { - if (!ro->internal_is_in_pinning_section) { + if (UNLIKELY(!ro->internal_is_in_pinning_section)) { // do not use zero copy, same as old behavior return GetAndRefSuperVersion(cfd); } @@ -5148,7 +5170,7 @@ DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd, const ReadOptions* ro) { ROCKSDB_ASSERT_EQ(tls->thread_id, ThisThreadID()); size_t cfid = cfd->GetID(); SuperVersion*& sv = tls->GetSuperVersionRef(cfid); - if (sv) { + if (LIKELY(sv != nullptr)) { if (LIKELY(sv->version_number == cfd->GetSuperVersionNumberNoAtomic())) { ROCKSDB_ASSERT_EQ(sv->cfd, cfd); return sv; @@ -6249,7 +6271,7 @@ Status DBImpl::GetLatestSequenceForKey( #if !defined(NDEBUG) constexpr size_t ts_sz = 0; #endif - LookupKey lkey(key, current_seq); + ParsedInternalKey lkey(key, current_seq, kValueTypeForSeek); #endif *seq = kMaxSequenceNumber; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 97bc9c3e2e..e26709b39b 100644 --- a/db/db_impl/db_impl.h 
+++ b/db/db_impl/db_impl.h @@ -448,7 +448,7 @@ class DBImpl : public DB { virtual Status LockWAL() override; virtual Status UnlockWAL() override; - virtual SequenceNumber GetLatestSequenceNumber() const override; + virtual SequenceNumber GetLatestSequenceNumber() const override final; // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire // and release db_mutex @@ -466,7 +466,7 @@ class DBImpl : public DB { virtual Status GetDbSessionId(std::string& session_id) const override; - ColumnFamilyHandle* DefaultColumnFamily() const override; + ColumnFamilyHandle* DefaultColumnFamily() const override final; ColumnFamilyHandle* PersistentStatsColumnFamily() const; @@ -677,7 +677,7 @@ class DBImpl : public DB { bool expose_blob_index = false, bool allow_refresh = true); - virtual SequenceNumber GetLastPublishedSequence() const { + virtual SequenceNumber GetLastPublishedSequence() const final { if (last_seq_same_as_publish_seq_) { return versions_->LastSequence(); } else { @@ -687,7 +687,7 @@ class DBImpl : public DB { // REQUIRES: joined the main write queue if two_write_queues is disabled, and // the second write queue otherwise. - virtual void SetLastPublishedSequence(SequenceNumber seq); + virtual void SetLastPublishedSequence(SequenceNumber seq) final; // Returns LastSequence in last_seq_same_as_publish_seq_ // mode and LastAllocatedSequence otherwise. This is useful when visiblility // depends also on data written to the WAL but not to the memtable. 
@@ -2241,6 +2241,8 @@ class DBImpl : public DB { SnapshotImpl* GetSnapshotImpl(SequenceNumber snapshot_seq, bool is_write_conflict_boundary, bool lock = true); + ColumnFamilyHandle* persist_stats_cf_handle() const { return persist_stats_cf_handle_; } + protected: // If snapshot_seq != kMaxSequenceNumber, then this function can only be @@ -2805,6 +2807,15 @@ class GetWithTimestampReadCallback : public ReadCallback { } }; +extern bool MaybeCFOptionsUpdateFrom +(ColumnFamilyOptions*, const std::string& cfname, const std::string& dbpath); +extern bool MaybeOptionsUpdateFrom +(DBOptions*, std::vector*, const std::string& dbpath); +extern void MaybeRetainDB(DB*, const std::vector&); +extern void MaybeForgetDB(DB*); +extern void MaybeRetainCF(DB*, ColumnFamilyHandle*); +extern void MaybeForgetCF(DB*, ColumnFamilyHandle*); + extern Options SanitizeOptions(const std::string& db, const Options& src, bool read_only = false, Status* logger_creation_s = nullptr); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 476ea032fa..8c62470ac0 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1167,6 +1167,15 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, logFileDropped(); continue; } + bool wal_memtable_format = immutable_db_options_.memtable_as_log_index; + if (immutable_db_options_.check_wal_format) { + if (IOStatus ios = log::Reader::IsMemTableAsLogIndexFile + (*fs_, fname, &wal_memtable_format); !ios.ok()) { + auto info_log = immutable_db_options_.info_log.get(); + ROCKS_LOG_WARN(info_log, "%s: %s", fname.c_str(), *ios.ToSSO()); + return Status(ios); + } + } std::unique_ptr file_reader; { @@ -1207,7 +1216,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), &reporter, true /*checksum*/, wal_number); boost::intrusive_ptr fmap; - if (immutable_db_options_.memtable_as_log_index) { + if (wal_memtable_format) { 
reader.InitSetMemTableAsLogIndex(*fs_); IOStatus ios = ReadonlyFileMmap::New(&fmap, *fs_, wal_number, fname); if (!ios.ok() && ios.ToString() != "Invalid argument: Empty File") @@ -1255,7 +1264,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, if (!status.ok()) { return status; } - if (new_batch && immutable_db_options_.memtable_as_log_index) { + if (new_batch && wal_memtable_format) { return Status::NotSupported("memtable_as_log_index", "WriteBatchTimestampSizeDifference"); } @@ -1980,6 +1989,12 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn) { + MaybeOptionsUpdateFrom(const_cast(&db_options), + const_cast*>(&column_families), + dbname); + *dbptr = nullptr; + ROCKSDB_SCOPE_EXIT(MaybeRetainDB(*dbptr, *handles)); + Status s = ValidateOptionsByTable(db_options, column_families); if (!s.ok()) { return s; diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 83e7a9a3bb..2cf43b8f07 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -94,10 +94,12 @@ Status DBImplReadOnly::GetImpl(const ReadOptions& read_options, return s; } } + LookupKey lkey(key, snapshot, read_options.timestamp); + #else + ParsedInternalKey lkey(key, snapshot, kValueTypeForSeek); #endif MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); // Look up starts here @@ -345,6 +347,11 @@ Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( *dbptr = nullptr; handles->clear(); + MaybeOptionsUpdateFrom(const_cast(&db_options), + const_cast*>(&column_families), + dbname); + ROCKSDB_SCOPE_EXIT(MaybeRetainDB(*dbptr, *handles)); + SuperVersionContext sv_context(/* create_superversion */ true); DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); 
impl->mutex_.Lock(); diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 99422b4eb1..8f4d2e6415 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -167,11 +167,21 @@ Status DBImplSecondary::MaybeInitLogReader( io_tracer_)); } + bool wal_memtable_format = immutable_db_options_.memtable_as_log_index; + if (immutable_db_options_.check_wal_format) { + if (IOStatus ios = log::Reader::IsMemTableAsLogIndexFile + (*fs_, fname, &wal_memtable_format); !ios.ok()) { + auto info_log = immutable_db_options_.info_log.get(); + ROCKS_LOG_WARN(info_log, "%s: %s", fname.c_str(), *ios.ToSSO()); + return Status(ios); + } + } + // Create the log reader. LogReaderContainer* log_reader_container = new LogReaderContainer( env_, immutable_db_options_.info_log, fname, std::move(file_reader), log_number); - if (immutable_db_options_.memtable_as_log_index) { + if (wal_memtable_format) { // will tailing log Reader, so must preserve mmap size auto mmap_size = GetMaxTotalWalSize() + 8*1024*1024; if (mmap_size > (1ull << 40)) { @@ -424,7 +434,7 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, #if defined(TOPLINGDB_WITH_TIMESTAMP) LookupKey lkey(key, snapshot, read_options.timestamp); #else - LookupKey lkey(key, snapshot); + ParsedInternalKey lkey(key, snapshot, kValueTypeForSeek); #endif PERF_TIMER_STOP(get_snapshot_time); bool done = false; @@ -801,6 +811,10 @@ Status DB::OpenAsSecondary( *dbptr = nullptr; DBOptions tmp_opts(db_options); + MaybeOptionsUpdateFrom(&tmp_opts, + const_cast*>(&column_families), + dbname); + ROCKSDB_SCOPE_EXIT(MaybeRetainDB(*dbptr, *handles)); Status s; if (nullptr == tmp_opts.info_log) { s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); diff --git a/db/db_iter.cc b/db/db_iter.cc index 969c8df2d6..56ca16d42a 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -595,7 +595,7 @@ struct VirtualCmpNoTS { template __always_inline void DBIter::FastIterKey::SetUK(const 
Slice& uk_slice) { - static_assert(UserKeyLen < sizeof(key)); + static_assert(UserKeyLen + 8 < sizeof(key)); auto uk_ptr = uk_slice.data(); auto uk_len = uk_slice.size(); if constexpr (UserKeyLen == 0) { @@ -612,8 +612,9 @@ void DBIter::FastIterKey::SetUK(const Slice& uk_slice) { _mm512_mask_storeu_epi8(buf, mask, r512); // do not write last 8 bytes(seq + value_type) }); - #elif defined(__clang__) || !defined(__GNUC__) || __GNUC__ >= 13 - static_assert(false, "UserKeyLen == 64 should not on non-avx512"); + #else + // (UserKeyLen != 64) == false here, for workardound + static_assert(UserKeyLen != 64, "UserKeyLen == 64 should not on non-avx512"); #endif } else { ROCKSDB_ASSERT_EQ(uk_len, UserKeyLen); @@ -2007,9 +2008,9 @@ void DBIter::Seek(const Slice& target) { if (statistics_ != nullptr) { // Decrement since we don't want to count this key as skipped RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + RecordTick(statistics_, ITER_BYTES_READ, key().size()); } - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size()); //local_stats_.BumpGlobalStatistics(statistics_); } diff --git a/db/db_iter.h b/db/db_iter.h index caea275161..9cc825f1c0 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -463,10 +463,10 @@ class DBIter final : public Iterator { if constexpr (FixLen == 64) // avx512 FixLen==64 means max is 64(without seqvt 8) return key.risk_to_str_local().notail(8); - if constexpr (FixLen != 0) + if constexpr (FixLen != 0) // FixLen != 0 means fixed len return key.risk_to_str_local_known_len().notail(8); else - return GetUserKey(); + return GetUserKey(); // not fixed len, a bit slower } Slice GetUserKey() const { return key.notail(8); } Slice GetInternalKey() const { return key.to(); } diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 2942379729..bd17b7c047 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ 
-302,7 +302,7 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { Status status; ReadOptions roptions; SequenceNumber max_covering_tombstone_seq = 0; - LookupKey lkey("key", kMaxSequenceNumber); + ParsedInternalKey lkey("key", kMaxSequenceNumber, kValueTypeForSeek); PinnableSlice pin; bool res = mem->Get(lkey, &pin, /*columns=*/nullptr, /*timestamp=*/nullptr, &status, &merge_context, &max_covering_tombstone_seq, diff --git a/db/dbformat.h b/db/dbformat.h index 9158cb9b7f..c90b1b2cec 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -22,6 +22,9 @@ #include "rocksdb/types.h" #include "util/coding.h" #include "util/user_comparator_wrapper.h" +#include + +#define TOPLINGDB_OMIT_LOOKUP_KEY 1 namespace ROCKSDB_NAMESPACE { @@ -161,6 +164,18 @@ struct ParsedInternalKey { const char* addr = user_key.data() + user_key.size() - ts_sz; return Slice(const_cast(addr), ts_sz); } + + struct InternalKeyBuf : private terark::minimal_sso<64> { + explicit InternalKeyBuf(const ParsedInternalKey& pik) : + terark::minimal_sso<64>(pik.user_key.size() + 8, + [&](char* buf, size_t len) { + EncodeFixed64(buf + (len - 8), pik.GetTag()); + memcpy(buf, pik.user_key.data(), len - 8); + }) + {} + operator Slice() const { return this->to(); } + }; + InternalKeyBuf MakeInternalKeyBuf() const { return InternalKeyBuf(*this); } }; static_assert(sizeof(ParsedInternalKey) == 32); @@ -1247,6 +1262,78 @@ struct BytewiseCompareInternalKey { return GetUnalignedU64(px + n) > GetUnalignedU64(py + n); #endif } + __always_inline bool operator()(const ParsedInternalKey& x, Slice y) const noexcept { + ROCKSDB_ASSERT_GE(y.size_, 8); + #if !TOPLINGDB_USE_MANUAL_MEMCMP + size_t n = std::min(x.user_key.size_, y.size_ - 8); + int cmp = memcmp(x.user_key.data_, y.data_, n); + if (0 != cmp) return cmp < 0; + if (x.user_key.size_ != y.size_ - 8) return x.user_key.size_ < y.size_ - 8; + return x.GetTag() > GetUnalignedU64(y.data_ + n); + #else + auto px = (const unsigned char*)x.user_key.data(); size_t nx = 
x.user_key.size(); + auto py = (const unsigned char*)y.data(); size_t ny = y.size() - 8; + size_t i = 0, n = std::min(nx, ny); + for (; i + 8 <= n; i += 8) { + auto ux = NativeOfBigEndian64(*(const uint64_t*)(px + i)); + auto uy = NativeOfBigEndian64(*(const uint64_t*)(py + i)); + if (ux != uy) + return ux < uy; + } + if (n % sizeof(uint64_t) >= 4) { + auto ux = NativeOfBigEndian32(*(const uint32_t*)(px + i)); + auto uy = NativeOfBigEndian32(*(const uint32_t*)(py + i)); + if (ux != uy) + return ux < uy; + else + i += 4; + } + for (; i < n; i++) { + int ux = px[i], uy = py[i]; + if (ux != uy) + return ux < uy; + } + if (nx != ny) + return nx < ny; + return x.GetTag() > GetUnalignedU64(py + n); + #endif + } + __always_inline bool operator()(Slice x, const ParsedInternalKey& y) const noexcept { + ROCKSDB_ASSERT_GE(x.size_, 8); + #if !TOPLINGDB_USE_MANUAL_MEMCMP + size_t n = std::min(x.size_ - 8, y.user_key.size_); + int cmp = memcmp(x.data_, y.user_key.data_, n); + if (0 != cmp) return cmp < 0; + if (x.size_ - 8 != y.user_key.size_) return x.size_ - 8 < y.user_key.size_; + return GetUnalignedU64(x.data_ + n) > y.GetTag(); + #else + auto px = (const unsigned char*)x.data(); size_t nx = x.size() - 8; + auto py = (const unsigned char*)y.user_key.data(); size_t ny = y.user_key.size(); + size_t i = 0, n = std::min(nx, ny); + for (; i + 8 <= n; i += 8) { + auto ux = NativeOfBigEndian64(*(const uint64_t*)(px + i)); + auto uy = NativeOfBigEndian64(*(const uint64_t*)(py + i)); + if (ux != uy) + return ux < uy; + } + if (n % sizeof(uint64_t) >= 4) { + auto ux = NativeOfBigEndian32(*(const uint32_t*)(px + i)); + auto uy = NativeOfBigEndian32(*(const uint32_t*)(py + i)); + if (ux != uy) + return ux < uy; + else + i += 4; + } + for (; i < n; i++) { + int ux = px[i], uy = py[i]; + if (ux != uy) + return ux < uy; + } + if (nx != ny) + return nx < ny; + return GetUnalignedU64(px + n) > y.GetTag(); + #endif + } __always_inline bool operator()(uint64_t x, uint64_t y) const noexcept { 
return x < y; } @@ -1254,19 +1341,38 @@ struct BytewiseCompareInternalKey { }; struct RevBytewiseCompareInternalKey { __always_inline bool operator()(Slice x, Slice y) const noexcept { + ROCKSDB_ASSERT_GE(x.size_, 8); + ROCKSDB_ASSERT_GE(y.size_, 8); size_t n = std::min(x.size_, y.size_) - 8; int cmp = memcmp(x.data_, y.data_, n); if (0 != cmp) return cmp > 0; if (x.size_ != y.size_) return x.size_ > y.size_; return GetUnalignedU64(x.data_ + n) > GetUnalignedU64(y.data_ + n); } + __always_inline bool operator()(const ParsedInternalKey& x, Slice y) const noexcept { + ROCKSDB_ASSERT_GE(y.size_, 8); + size_t n = std::min(x.user_key.size_, y.size_ - 8); + int cmp = memcmp(x.user_key.data_, y.data_, n); + if (0 != cmp) return cmp > 0; + if (x.user_key.size_ != y.size_ - 8) return x.user_key.size_ > y.size_ - 8; + return x.GetTag() > GetUnalignedU64(y.data_ + n); + } + __always_inline bool operator()(Slice x, const ParsedInternalKey& y) const noexcept { + ROCKSDB_ASSERT_GE(x.size_, 8); + size_t n = std::min(x.size_ - 8, y.user_key.size_); + int cmp = memcmp(x.data_, y.user_key.data_, n); + if (0 != cmp) return cmp > 0; + if (x.size_ - 8 != y.user_key.size_) return x.size_ - 8 > y.user_key.size_; + return GetUnalignedU64(x.data_ + n) > y.GetTag(); + } __always_inline bool operator()(uint64_t x, uint64_t y) const noexcept { return x > y; } RevBytewiseCompareInternalKey(...) {} }; struct FallbackVirtCmp { - __always_inline bool operator()(Slice x, Slice y) const { + template + __always_inline bool operator()(const KeyX& x, const KeyY& y) const { return icmp->Compare(x, y) < 0; } const InternalKeyComparator* icmp; diff --git a/db/flush_job.cc b/db/flush_job.cc index 9da817ac56..1adb729d23 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -748,7 +748,7 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Count entry bytes as payload. 
payload += entry_size; - LookupKey lkey(res.user_key, kMaxSequenceNumber); + ParsedInternalKey lkey(res.user_key, kMaxSequenceNumber, kValueTypeForSeek); // Paranoia: zero out these values just in case. max_covering_tombstone_seq = 0; diff --git a/db/internal_stats.cc b/db/internal_stats.cc index a953e5a8bb..940361e75f 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -1624,7 +1624,7 @@ void InternalStats::DumpDBStats(std::string* value) { // The format is the same for interval stats. snprintf(buf, sizeof(buf), "Cumulative writes: %s writes, %s keys, %s commit groups, " - "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n", + "%.1f writes per commit group, ingest: %7.2f GB, %7.2f MB/s\n", NumberToHumanString(write_other + write_self).c_str(), NumberToHumanString(num_keys_written).c_str(), NumberToHumanString(write_self).c_str(), @@ -1635,20 +1635,13 @@ void InternalStats::DumpDBStats(std::string* value) { value->append(buf); // WAL snprintf(buf, sizeof(buf), - "Cumulative WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", + "Cumulative WAL : %s writes, %s sync, " + "%7.2f writes per sync, written: %7.2f GB, %7.2f MB/s\n", NumberToHumanString(write_with_wal).c_str(), NumberToHumanString(wal_synced).c_str(), write_with_wal / std::max(1.0, static_cast(wal_synced)), wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001)); value->append(buf); - // Stall - AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true); - snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n", - human_micros, - // 10000 = divide by 1M to get secs, then multiply by 100 for pct - write_stall_micros / 10000.0 / std::max(seconds_up, 0.001)); - value->append(buf); // Interval uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other; @@ -1657,8 +1650,8 @@ void InternalStats::DumpDBStats(std::string* value) { num_keys_written - db_stats_snapshot_.num_keys_written; snprintf( buf, 
sizeof(buf), - "Interval writes: %s writes, %s keys, %s commit groups, " - "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n", + "Interval writes: %s writes, %s keys, %s commit groups, " + "%.1f writes per commit group, ingest: %7.2f MB, %7.2f MB/s\n", NumberToHumanString(interval_write_other + interval_write_self).c_str(), NumberToHumanString(interval_num_keys_written).c_str(), NumberToHumanString(interval_write_self).c_str(), @@ -1675,8 +1668,8 @@ void InternalStats::DumpDBStats(std::string* value) { uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes; snprintf(buf, sizeof(buf), - "Interval WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", + "Interval WAL : %s writes, %s sync, " + "%7.2f writes per sync, written: %7.2f GB, %7.2f MB/s\n", NumberToHumanString(interval_write_with_wal).c_str(), NumberToHumanString(interval_wal_synced).c_str(), interval_write_with_wal / @@ -1685,10 +1678,17 @@ void InternalStats::DumpDBStats(std::string* value) { interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); value->append(buf); + // Stall + AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true); + snprintf(buf, sizeof(buf), "Cumulative stall : %s, %.1f percent\n", + human_micros, + // 10000 = divide by 1M to get secs, then multiply by 100 for pct + write_stall_micros / 10000.0 / std::max(seconds_up, 0.001)); + value->append(buf); // Stall AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros, human_micros, kHumanMicrosLen, true); - snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros, + snprintf(buf, sizeof(buf), "Interval stall : %s, %.1f percent\n", human_micros, // 10000 = divide by 1M to get secs, then multiply by 100 for pct (write_stall_micros - db_stats_snapshot_.write_stall_micros) / 10000.0 / std::max(interval_seconds_up, 0.001)); diff --git a/db/lookup_key.h b/db/lookup_key.h index ee9c889c37..133f150778 100644 --- 
a/db/lookup_key.h +++ b/db/lookup_key.h @@ -14,6 +14,7 @@ #include "rocksdb/slice.h" #include "rocksdb/types.h" #include "port/likely.h" +#include "dbformat.h" namespace ROCKSDB_NAMESPACE { @@ -62,6 +63,10 @@ class LookupKey { return Slice(longstart_, klength_ - 8); } + operator ParsedInternalKey() const { + return ParsedInternalKey(internal_key()); + } + private: // We construct a char array of the form: // short keys: klength_ <= sizeof(space_) diff --git a/db/memtable.cc b/db/memtable.cc index 1abb6ec0bb..6c8bbfab61 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -170,6 +170,10 @@ MemTable::~MemTable() { assert(refs_ == 0); } +// for ApproximateMemoryUsage(insert_hints_) +static size_t ApproximateMemoryUsage(const terark::hash_strmap& map) { + return map.capacity() * 16 + map.strpool_capacity() + map.bucket_size() * 4; +} size_t MemTable::ApproximateMemoryUsage() { size_t usages[] = { arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(), @@ -298,6 +302,26 @@ int MemTable::KeyComparator::operator()( return comparator.CompareKeySeq(a, key); } +int MemTable::KeyComparator::operator()(const char* prefix_len_key, + const ParsedInternalKey& b) const { + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.Compare(a, b); +} + +int MemTable::KeyComparator::operator()(const ParsedInternalKey& a, + const char* prefix_len_key) const { + Slice b = GetLengthPrefixedSlice(prefix_len_key); + return comparator.Compare(a, b); +} + +void MemTableRep::GetPIK(const struct ReadOptions& ro, + const ParsedInternalKey& pik, void* callback_args, + bool (*callback_func)(void* arg, const KeyValuePair&)) +{ + LookupKey lk(pik.user_key, pik.sequence); + Get(ro, lk, callback_args, callback_func); +} + void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { throw std::runtime_error("concurrent insert not supported"); } @@ -924,7 +948,12 @@ namespace { struct Saver { Status* status; - const LookupKey* key; + struct LikeLookupKey : private Slice { + using 
Slice::operator=; + const Slice& user_key() const { return *this; } + const LikeLookupKey* operator->() const { return this; } + }; + LikeLookupKey key; PinnableSlice* value; PinnableWideColumns* columns; SequenceNumber seq; @@ -1308,7 +1337,7 @@ static bool SaveValue(void* arg, const MemTableRep::KeyValuePair& pair) { } ROCKSDB_FLATTEN -bool MemTable::Get(const LookupKey& key, PinnableSlice* value, +bool MemTable::Get(const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -1325,11 +1354,11 @@ bool MemTable::Get(const LookupKey& key, PinnableSlice* value, std::unique_ptr range_del_iter( NewRangeTombstoneIterator(read_opts, - GetInternalKeySeqno(key.internal_key()), + key.sequence, immutable_memtable)); if (range_del_iter != nullptr) { SequenceNumber covering_seq = - range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key()); + range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key); if (covering_seq > *max_covering_tombstone_seq) { *max_covering_tombstone_seq = covering_seq; if (timestamp) { @@ -1345,9 +1374,9 @@ bool MemTable::Get(const LookupKey& key, PinnableSlice* value, bool may_contain = true; #if defined(TOPLINGDB_WITH_TIMESTAMP) size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); - Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key, ts_sz); #else - Slice user_key_without_ts = key.user_key(); + Slice user_key_without_ts = key.user_key; #endif bool bloom_checked = false; // when both memtable_whole_key_filtering and prefix_extractor_ are set, @@ -1380,7 +1409,7 @@ bool MemTable::Get(const LookupKey& key, PinnableSlice* value, saver.status = s; saver.found_final_value = false; saver.merge_in_progress = s->IsMergeInProgress(); - saver.key = &key; + saver.key = key.user_key; saver.value = value; 
saver.columns = columns; saver.timestamp = timestamp; @@ -1402,7 +1431,7 @@ bool MemTable::Get(const LookupKey& key, PinnableSlice* value, if (LIKELY(value != nullptr)) { value->Reset(); } - table_->Get(read_opts, key, &saver, SaveValue); + table_->GetPIK(read_opts, key, &saver, SaveValue); *seq = saver.seq; // No change to value, since we have not yet found a Put/Delete @@ -1460,10 +1489,10 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, if (!no_range_del) { std::unique_ptr range_del_iter( NewRangeTombstoneIteratorInternal( - read_options, GetInternalKeySeqno(iter->lkey->internal_key()), + read_options, iter->ikey.sequence, immutable_memtable)); SequenceNumber covering_seq = - range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()); + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ikey.user_key); if (covering_seq > iter->max_covering_tombstone_seq) { iter->max_covering_tombstone_seq = covering_seq; if (iter->timestamp) { @@ -1478,7 +1507,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, saver.status = iter->s; saver.found_final_value = false; saver.merge_in_progress = iter->s->IsMergeInProgress(); - saver.key = iter->lkey; + saver.key = iter->ikey.user_key; saver.value = iter->value; // not null if (saver.value) saver.value->Reset(); @@ -1499,7 +1528,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, saver.allow_data_in_errors = moptions_.allow_data_in_errors; saver.is_zero_copy = read_options.internal_is_in_pinning_section; saver.needs_user_key_cmp_in_get = needs_user_key_cmp_in_get_; - table_->Get(read_options, *(iter->lkey), &saver, SaveValue); + table_->GetPIK(read_options, iter->ikey, &saver, SaveValue); if (!saver.found_final_value && saver.merge_in_progress) { *(iter->s) = Status::MergeInProgress(); diff --git a/db/memtable.h b/db/memtable.h index e04a456d4c..04549a3187 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -9,8 +9,6 @@ #pragma once 
#include -#include -#include #include #include #include @@ -23,15 +21,12 @@ #include "db/version_edit.h" #include "memory/allocator.h" #include "memory/concurrent_arena.h" -#include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "table/internal_iterator.h" #include "table/multiget_context.h" #include "util/dynamic_bloom.h" -#include "util/hash.h" -#include "util/hash_containers.h" #if defined(TOPLINGDB_WITH_TIMESTAMP) #include @@ -139,6 +134,10 @@ class MemTable : public CacheAlignedNewDelete { const char* prefix_len_key2) const override; virtual int operator()(const char* prefix_len_key, const DecodedType& key) const override; + virtual int operator()(const char* prefix_len_key, + const ParsedInternalKey&) const override; + virtual int operator()(const ParsedInternalKey&, + const char* prefix_len_key) const override; virtual const InternalKeyComparator* icomparator() const override; }; @@ -306,7 +305,7 @@ class MemTable : public CacheAlignedNewDelete { // @param immutable_memtable Whether this memtable is immutable. Used // internally by NewRangeTombstoneIterator(). See comment above // NewRangeTombstoneIterator() for more detail. 
- bool Get(const LookupKey& key, PinnableSlice* value, + bool Get(const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, @@ -314,7 +313,7 @@ class MemTable : public CacheAlignedNewDelete { ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, bool do_merge = true); - bool Get(const LookupKey& key, PinnableSlice* value, + bool Get(const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -698,7 +697,8 @@ class MemTable : public CacheAlignedNewDelete { const SliceTransform* insert_with_hint_prefix_extractor_; // Insert hints for each prefix. - UnorderedMapH insert_hints_; + // UnorderedMapH insert_hints_; + terark::hash_strmap insert_hints_; // Timestamp of oldest key std::atomic oldest_key_time_; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index deeccae575..9eacca84e9 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -110,7 +110,7 @@ int MemTableList::NumFlushed() const { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. 
-bool MemTableListVersion::Get(const LookupKey& key, PinnableSlice* value, +bool MemTableListVersion::Get(const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, @@ -135,7 +135,7 @@ void MemTableListVersion::MultiGet(const ReadOptions& read_options, } bool MemTableListVersion::GetMergeOperands( - const LookupKey& key, Status* s, MergeContext* merge_context, + const ParsedInternalKey& key, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { for (MemTable* memtable : memlist_) { bool done = memtable->Get( @@ -150,7 +150,7 @@ bool MemTableListVersion::GetMergeOperands( } bool MemTableListVersion::GetFromHistory( - const LookupKey& key, PinnableSlice* value, PinnableWideColumns* columns, + const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { @@ -160,7 +160,7 @@ bool MemTableListVersion::GetFromHistory( } bool MemTableListVersion::GetFromList( - std::list* list, const LookupKey& key, PinnableSlice* value, + std::list* list, const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback, diff --git a/db/memtable_list.h b/db/memtable_list.h index 328d160e83..6630046b68 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -57,14 +57,14 @@ class MemTableListVersion { // If any operation was found for this key, its most recent sequence number // will be stored in *seq on success (regardless of whether true/false is // returned). Otherwise, *seq will be set to kMaxSequenceNumber. 
- bool Get(const LookupKey& key, PinnableSlice* value, + bool Get(const ParsedInternalKey&, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); - bool Get(const LookupKey& key, PinnableSlice* value, + bool Get(const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -83,7 +83,7 @@ class MemTableListVersion { // Returns all the merge operands corresponding to the key by searching all // memtables starting from the most recent one. - bool GetMergeOperands(const LookupKey& key, Status* s, + bool GetMergeOperands(const ParsedInternalKey&, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts); @@ -92,13 +92,13 @@ class MemTableListVersion { // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain // writes that are also present in the SST files. 
- bool GetFromHistory(const LookupKey& key, PinnableSlice* value, + bool GetFromHistory(const ParsedInternalKey&, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr); - bool GetFromHistory(const LookupKey& key, PinnableSlice* value, + bool GetFromHistory(const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -162,7 +162,7 @@ class MemTableListVersion { // Return true if memtable is trimmed bool TrimHistory(autovector* to_delete, size_t usage); - bool GetFromList(std::list* list, const LookupKey& key, + bool GetFromList(std::list* list, const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* s, MergeContext* merge_context, diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 1766456791..1314a7557c 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -34,6 +34,12 @@ static auto g_cspp_fac = []()-> std::shared_ptr { return nullptr; }(); +struct HideLookupKey : ParsedInternalKey { + HideLookupKey(Slice uk, uint64_t seq) : + ParsedInternalKey(uk, seq, kValueTypeForSeek) {} +}; +#define LookupKey HideLookupKey + class MemTableListTest : public testing::Test { public: std::string dbname; diff --git a/db/repair.cc b/db/repair.cc index 69c662d22c..2fd9992b28 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -401,7 +401,16 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - if (db_options_.memtable_as_log_index) { + bool wal_memtable_format = db_options_.memtable_as_log_index; + if (db_options_.check_wal_format) { + if (IOStatus ios = log::Reader::IsMemTableAsLogIndexFile + (*fs, logname, &wal_memtable_format); !ios.ok()) { + 
ROCKS_LOG_WARN(db_options_.info_log, "%s: %s", + logname.c_str(), *ios.ToSSO()); + return Status(ios); + } + } + if (wal_memtable_format) { reader.InitSetMemTableAsLogIndex(*fs); auto [fmap, ios] = ReadonlyFileMmap::New(*fs, log, logname); if (!ios.ok() && ios.ToString() != "Invalid argument: Empty File") diff --git a/db/table_cache.cc b/db/table_cache.cc index 0dd427049e..3544cb0d61 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -439,7 +439,8 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, Status TableCache::GetWithRowCache( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const FileMetaData& file_meta, const ParsedInternalKey& pik, + GetContext* get_context, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, int level, @@ -453,6 +454,8 @@ Status TableCache::GetWithRowCache( // Check row cache if enabled. // Reuse row_cache_key sequence number when row cache hits. if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { + const auto ikbuf = pik.MakeInternalKeyBuf(); + const Slice k = ikbuf; auto user_key = ExtractUserKey(k); uint64_t cache_entry_seq_no = CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); @@ -486,7 +489,7 @@ Status TableCache::GetWithRowCache( t->NewRangeTombstoneIterator(options)); if (range_del_iter != nullptr) { SequenceNumber seq = - range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)); + range_del_iter->MaxCoveringTombstoneSeqnum(pik.user_key); if (seq > *max_covering_tombstone_seq) { *max_covering_tombstone_seq = seq; if (get_context->NeedTimestamp()) { @@ -498,7 +501,7 @@ Status TableCache::GetWithRowCache( } if (s.ok()) { get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. 
- s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); + s = t->GetPIK(options, pik, get_context, prefix_extractor.get(), skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set diff --git a/db/table_cache.h b/db/table_cache.h index ee85f7ba4e..fc97e10ff3 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -116,7 +116,7 @@ class TableCache { Status Get( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const FileMetaData& file_meta, const ParsedInternalKey& k, GetContext* get_context, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, @@ -136,7 +136,7 @@ class TableCache { Status GetWithRowCache( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const FileMetaData& file_meta, const ParsedInternalKey&, GetContext*, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, @@ -144,7 +144,7 @@ class TableCache { Status GetNoneRowCache( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const FileMetaData& file_meta, const ParsedInternalKey&, GetContext*, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, @@ -329,7 +329,8 @@ __always_inline Status TableCache::GetNoneRowCache( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const FileMetaData& 
file_meta, const ParsedInternalKey& pik, + GetContext* get_context, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, int level, @@ -355,19 +356,21 @@ Status TableCache::GetNoneRowCache( std::unique_ptr range_del_iter( t->NewRangeTombstoneIterator(options)); if (range_del_iter != nullptr) { - auto seq = range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)); + auto seq = range_del_iter->MaxCoveringTombstoneSeqnum(pik.user_key); if (seq > *max_covering_tombstone_seq) { *max_covering_tombstone_seq = seq; + #if defined(TOPLINGDB_WITH_TIMESTAMP) if (get_context->NeedTimestamp()) { get_context->SetTimestampFromRangeTombstone(range_del_iter->timestamp()); } + #endif } } } if (LIKELY(handle == nullptr)) { // optimize for compiler tail call - return t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); + return t->GetPIK(options, pik, get_context, prefix_extractor.get(), skip_filters); } else { - Status s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); + Status s = t->GetPIK(options, pik, get_context, prefix_extractor.get(), skip_filters); cache_.Release(handle); return s; } diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index 8ff03ec501..16b11094c0 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -40,7 +40,8 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) // sequence numbers, we cannot use it if we need to fetch the sequence. 
if (lookup_row_cache) { GetContext* first_context = first_key.get_context; - CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context, + const auto first_ik = first_key.InternalKeyBuf(); + CreateRowCacheKeyPrefix(options, fd, first_ik, first_context, row_cache_key); row_cache_key_prefix_size = row_cache_key.Size(); diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 729420a8ec..7683c4b357 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -288,10 +288,20 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) { return s; } assert(file); + const std::string& fname = file->file_name(); + bool wal_memtable_format = options_->memtable_as_log_index; + if (options_->check_wal_format) { + if (IOStatus ios = log::Reader::IsMemTableAsLogIndexFile + (*options_->fs, fname, &wal_memtable_format); !ios.ok()) { + ROCKS_LOG_WARN(options_->info_log, "%s: %s", + fname.c_str(), *ios.ToSSO()); + return Status(ios); + } + } current_log_reader_.reset( new log::Reader(options_->info_log, std::move(file), &reporter_, read_options_.verify_checksums_, log_file->LogNumber())); - if (options_->memtable_as_log_index) { + if (wal_memtable_format) { current_log_reader_->InitSetMemTableAsLogIndex(*options_->fs); } return Status::OK(); diff --git a/db/version_set.cc b/db/version_set.cc index 38a7fcfe79..573c57e970 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -109,9 +109,28 @@ namespace { #define __builtin_prefetch(ptr) #endif +inline uint64_t HostPrefixCache(const ParsedInternalKey& ikey) { + if (LIKELY(ikey.user_key.size_ >= 8)) { + uint64_t data = GetUnalignedU64(ikey.user_key.data_); + return NativeOfBigEndian64(data); + } else { + #if defined(__AVX512VL__) && defined(__AVX512BW__) + //#pragma message "__AVX512VL__ && __AVX512BW__, use _mm_maskz_loadu_epi8" + // load 128 bits, keep low 64 bits, discard high 64 bits + auto mask = _bzhi_u32(-1, uint32_t(ikey.user_key.size_)); + auto m128 = 
_mm_maskz_loadu_epi8(mask, ikey.user_key.data_); + uint64_t data = (uint64_t)_mm_extract_epi64(m128, 0); + #else + uint64_t data = 0; + memcpy(&data, ikey.user_key.data_, ikey.user_key.size_); + #endif + return NativeOfBigEndian64(data); + } +} + template size_t FindFileInRangeTmpl(Cmp cmp, const LevelFilesBrief& brief, - Slice key, size_t lo, size_t hi) { + const ParsedInternalKey& key, size_t lo, size_t hi) { const uint64_t* pxcache = brief.prefix_cache; const uint64_t key_prefix = HostPrefixCache(key); const FdWithKeyRange* a = brief.files; @@ -141,7 +160,7 @@ size_t FindFileInRangeTmpl(Cmp cmp, const LevelFilesBrief& brief, static size_t FindFileInRangeTmpl(FallbackVirtCmp cmp, const LevelFilesBrief& brief, - Slice key, size_t lo, size_t hi) { + const ParsedInternalKey& key, size_t lo, size_t hi) { const FdWithKeyRange* a = brief.files; while (lo < hi) { size_t mid = (lo + hi) / 2; @@ -157,7 +176,7 @@ template static ROCKSDB_FLATTEN int FindFileInRangeInst(const InternalKeyComparator* icmp, const LevelFilesBrief& brief, - Slice key, size_t lo, size_t hi) { + const ParsedInternalKey& key, size_t lo, size_t hi) { return (int)FindFileInRangeTmpl(Cmp{icmp}, brief, key, lo, hi); } @@ -167,7 +186,7 @@ int FindFileInRangeInst(const InternalKeyComparator* icmp, __attribute_noinline__ #endif int FindFileInRange(const InternalKeyComparator& icmp, - const LevelFilesBrief& file_level, const Slice& key, + const LevelFilesBrief& file_level, const ParsedInternalKey& key, uint32_t left, uint32_t right) { #ifdef TOPLINGDB_NO_OPT_FindFileInRange #pragma message "TOPLINGDB_NO_OPT_FindFileInRange is defined, intended for benchmark baseline" @@ -236,217 +255,6 @@ Status OverlapWithIterator(const Comparator* ucmp, // levels. Therefore we are guaranteed that if we find data // in a smaller level, later levels are irrelevant (unless we // are MergeInProgress). 
-template -class FilePicker { - __always_inline - int FindFileInRange(const InternalKeyComparator& icmp, - const LevelFilesBrief& file_level, const Slice& key, - size_t left, size_t right) { - return (int)FindFileInRangeTmpl(IKCmp{&icmp}, file_level, key, left, right); - } - public: - FilePicker(const Slice& user_key, const Slice& ikey, - autovector* file_levels, unsigned int num_levels, - FileIndexer* file_indexer, const Comparator* user_comparator, - const InternalKeyComparator* internal_comparator) - : num_levels_(num_levels), - curr_level_(static_cast(-1)), - returned_file_level_(static_cast(-1)), - hit_file_level_(static_cast(-1)), - search_left_bound_(0), - search_right_bound_(FileIndexer::kLevelMaxIndex), - level_files_brief_(file_levels), - is_hit_file_last_in_level_(false), - curr_file_level_(nullptr), - user_key_(user_key), - ikey_(ikey), - file_indexer_(file_indexer), - user_comparator_(user_comparator), - internal_comparator_(internal_comparator) { - // Setup member variables to search first level. - search_ended_ = !PrepareNextLevel(); - if (!search_ended_) { - // Prefetch Level 0 table data to avoid cache miss if possible. - for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { - auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; - if (r) { - r->Prepare(ikey); - } - } - } - } - - int GetCurrentLevel() const { return curr_level_; } - - FdWithKeyRange* GetNextFile() { - UKCmp cmp{user_comparator_}; - while (!search_ended_) { // Loops over different levels. - while (curr_index_in_curr_level_ < curr_file_level_->num_files) { - // Loops over all files in current level. 
- FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; - hit_file_level_ = curr_level_; - is_hit_file_last_in_level_ = - curr_index_in_curr_level_ == curr_file_level_->num_files - 1; - int cmp_largest = -1; - - // Do key range filtering of files or/and fractional cascading if: - // (1) not all the files are in level 0, or - // (2) there are more than 3 current level files - // If there are only 3 or less current level files in the system, we - // skip the key range filtering. In this case, more likely, the system - // is highly tuned to minimize number of tables queried by each query, - // so it is unlikely that key range filtering is more efficient than - // querying the files. - if (num_levels_ > 1 || curr_file_level_->num_files > 3) { - // Check if key is within a file's range. If search left bound and - // right bound point to the same find, we are sure key falls in - // range. - assert(curr_level_ == 0 || - curr_index_in_curr_level_ == start_index_in_curr_level_ || - cmp(user_key_, ExtractUserKey(f->smallest_key)) <= 0); - - int cmp_smallest = cmp(user_key_, ExtractUserKey(f->smallest_key)); - if (cmp_smallest >= 0) { - cmp_largest = cmp(user_key_, ExtractUserKey(f->largest_key)); - } - - // Setup file search bound for the next level based on the - // comparison results - if (curr_level_ > 0) { - file_indexer_->GetNextLevelIndex( - curr_level_, curr_index_in_curr_level_, cmp_smallest, - cmp_largest, &search_left_bound_, &search_right_bound_); - } - // Key falls out of current file's range - if (cmp_smallest < 0 || cmp_largest > 0) { - if (curr_level_ == 0) { - ++curr_index_in_curr_level_; - continue; - } else { - // Search next level. - break; - } - } - } - - returned_file_level_ = curr_level_; - if (curr_level_ > 0 && cmp_largest < 0) { - // No more files to search in this level. - search_ended_ = !PrepareNextLevel(); - } else { - ++curr_index_in_curr_level_; - } - return f; - } - // Start searching next level. 
- search_ended_ = !PrepareNextLevel(); - } - // Search ended. - return nullptr; - } - - // getter for current file level - // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts - unsigned int GetHitFileLevel() { return hit_file_level_; } - - // Returns true if the most recent "hit file" (i.e., one returned by - // GetNextFile()) is at the last index in its level. - bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } - - private: - unsigned int num_levels_; - unsigned int curr_level_; - unsigned int returned_file_level_; - unsigned int hit_file_level_; - int32_t search_left_bound_; - int32_t search_right_bound_; - autovector* level_files_brief_; - bool search_ended_; - bool is_hit_file_last_in_level_; - LevelFilesBrief* curr_file_level_; - unsigned int curr_index_in_curr_level_; - unsigned int start_index_in_curr_level_; - Slice user_key_; - Slice ikey_; - FileIndexer* file_indexer_; - const Comparator* user_comparator_; - const InternalKeyComparator* internal_comparator_; - - // Setup local variables to search next level. - // Returns false if there are no more levels to search. - bool PrepareNextLevel() { - curr_level_++; - while (curr_level_ < num_levels_) { - curr_file_level_ = &(*level_files_brief_)[curr_level_]; - if (curr_file_level_->num_files == 0) { - // When current level is empty, the search bound generated from upper - // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is - // also empty. - assert(search_left_bound_ == 0); - assert(search_right_bound_ == -1 || - search_right_bound_ == FileIndexer::kLevelMaxIndex); - // Since current level is empty, it will need to search all files in - // the next level - search_left_bound_ = 0; - search_right_bound_ = FileIndexer::kLevelMaxIndex; - curr_level_++; - continue; - } - - // Some files may overlap each other. We find - // all files that overlap user_key and process them in order from - // newest to oldest. In the context of merge-operator, this can occur at - // any level. 
Otherwise, it only occurs at Level-0 (since Put/Deletes - // are always compacted into a single entry). - int32_t start_index; - if (curr_level_ == 0) { - // On Level-0, we read through all files to check for overlap. - start_index = 0; - } else { - // On Level-n (n>=1), files are sorted. Binary search to find the - // earliest file whose largest key >= ikey. Search left bound and - // right bound are used to narrow the range. - if (search_left_bound_ <= search_right_bound_) { - if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { - search_right_bound_ = - static_cast(curr_file_level_->num_files) - 1; - } - // `search_right_bound_` is an inclusive upper-bound, but since it was - // determined based on user key, it is still possible the lookup key - // falls to the right of `search_right_bound_`'s corresponding file. - // So, pass a limit one higher, which allows us to detect this case. - start_index = - FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, - static_cast(search_left_bound_), - static_cast(search_right_bound_) + 1); - if (start_index == search_right_bound_ + 1) { - // `ikey_` comes after `search_right_bound_`. The lookup key does - // not exist on this level, so let's skip this level and do a full - // binary search on the next level. - search_left_bound_ = 0; - search_right_bound_ = FileIndexer::kLevelMaxIndex; - curr_level_++; - continue; - } - } else { - // search_left_bound > search_right_bound, key does not exist in - // this level. Since no comparison is done in this level, it will - // need to search all files in the next level. - search_left_bound_ = 0; - search_right_bound_ = FileIndexer::kLevelMaxIndex; - curr_level_++; - continue; - } - } - start_index_in_curr_level_ = start_index; - curr_index_in_curr_level_ = start_index; - - return true; - } - // curr_level_ = num_levels_. So, no more levels to search. 
- return false; - } -}; } // anonymous namespace class FilePickerMultiGet { @@ -494,7 +302,7 @@ class FilePickerMultiGet { auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; if (r) { for (auto iter = range_.begin(); iter != range_.end(); ++iter) { - r->Prepare(iter->ikey); + r->PreparePIK(iter->ikey); } } } @@ -877,7 +685,7 @@ class FilePickerMultiGet { // key falls to the right of `search_right_bound_`'s corresponding // file. So, pass a limit one higher, which allows us to detect this // case. - Slice& ikey = mget_iter->ikey; + auto& ikey = mget_iter->ikey; start_index = FindFileInRange( *internal_comparator_, *curr_file_level_, ikey, static_cast(fp_ctx.search_left_bound), @@ -947,6 +755,12 @@ Version::~Version() { int FindFile(const InternalKeyComparator& icmp, const LevelFilesBrief& file_level, const Slice& key) { + return FindFileInRange(icmp, file_level, ParsedInternalKey(key), 0, + static_cast(file_level.num_files)); +} + +int FindFile(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const ParsedInternalKey& key) { return FindFileInRange(icmp, file_level, key, 0, static_cast(file_level.num_files)); } @@ -1209,12 +1023,11 @@ class LevelIterator final : public InternalIterator { iw->prepare_and_get_value_ = ForgeFuncPtr(this, &LevelIterator::PrepareAndGetValue); } else { - iw->work_iter_ = file_iter_.iter(); - iw->value_iter_ = file_iter_.iter(); - iw->next_and_get_result_ = ForgeFuncPtr(file_iter_.iter(), - &InternalIterator::NextAndGetResult); - iw->prepare_and_get_value_ = ForgeFuncPtr(file_iter_.iter(), - &InternalIterator::PrepareAndGetValue); + auto fi = file_iter_.iter(); + iw->work_iter_ = fi; + iw->value_iter_ = fi; + iw->next_and_get_result_ = ForgeFuncPtr(fi, &InternalIterator::NextAndGetResult); + iw->prepare_and_get_value_ = ForgeFuncPtr(fi, &InternalIterator::PrepareAndGetValue); } retry_already_goes_invalid_ = false; } @@ -2518,14 +2331,19 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* 
vset, if (IsForwardBytewiseComparator(user_comparator())) { m_get = ExtractFuncPtr(this, &Version::GetInst ); + m_get_no_watch = ExtractFuncPtr(this, &Version::GetInst + ); } else if (IsReverseBytewiseComparator(user_comparator())) { m_get = ExtractFuncPtr(this, &Version::GetInst ); + m_get_no_watch = ExtractFuncPtr(this, &Version::GetInst + ); } else { m_get = ExtractFuncPtr(this, &Version::GetInst ); + m_get_no_watch = m_get; // do not instantiate more } } } @@ -2659,9 +2477,9 @@ void Version::MultiGetBlob( } } -template +template ROCKSDB_FLATTEN -void Version::GetInst(const ReadOptions& read_options, const LookupKey& k, +void Version::GetInst(const ReadOptions& read_options, const ParsedInternalKey& ikey, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* status, MergeContext* merge_context, @@ -2669,8 +2487,7 @@ void Version::GetInst(const ReadOptions& read_options, const LookupKey& k, PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, bool* is_blob, bool do_merge) { - Slice ikey = k.internal_key(); - Slice user_key = k.user_key(); + const Slice& user_key = ikey.user_key; assert(status->ok() || status->IsMergeInProgress()); @@ -2680,10 +2497,18 @@ void Version::GetInst(const ReadOptions& read_options, const LookupKey& k, } uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; +#if defined(TOPLINGDB_WITH_FABRICATED_COMPLEXITY) if (vset_ && vset_->block_cache_tracer_ && vset_->block_cache_tracer_->is_tracing_enabled()) { tracing_get_id = vset_->block_cache_tracer_->NextGetId(); } +#endif +#if !defined(TOPLINGDB_WITH_TIMESTAMP) + timestamp = nullptr; // tell compiler it is always null +#endif +#if !defined(TOPLINGDB_WITH_WIDE_COLUMNS) + columns = nullptr; // tell compiler it is always null +#endif // Note: the old StackableDB-based BlobDB passes in // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we @@ -2707,45 +2532,172 
@@ void Version::GetInst(const ReadOptions& read_options, const LookupKey& k, pinned_iters_mgr->StartPinning(); } - FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, - storage_info_.num_non_empty_levels_, - &storage_info_.file_indexer_, user_comparator(), - internal_comparator()); - FdWithKeyRange* f = fp.GetNextFile(); +// FilePicker is a negative optimization, revert it! - while (f != nullptr) { +#if defined(ROCKSDB_UNIT_TEST) +// Prefetch Level 0 table data to avoid cache miss if possible. +if (storage_info_.num_non_empty_levels_ > 0 && + storage_info_.level_files_brief_[0].num_files > 0) { + for (size_t i = 0; i < storage_info_.level_files_brief_[0].num_files; ++i) { + if (auto r = storage_info_.level_files_brief_[0].files[i].fd.table_reader) { + r->PreparePIK(ikey); + } + } +} +#endif +int32_t search_left_bound = 0; +int32_t search_right_bound = FileIndexer::kLevelMaxIndex; +for (int curr_level = 0; curr_level < storage_info_.num_non_empty_levels_; curr_level++) { + LevelFilesBrief* curr_file_level = &storage_info_.level_files_brief_[curr_level]; + if (curr_file_level->num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. + assert(search_left_bound == 0); + assert(search_right_bound == -1 || + search_right_bound == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in + // the next level + search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. In the context of merge-operator, this can occur at + // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes + // are always compacted into a single entry). 
+ int32_t start_index; + if (curr_level == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. Binary search to find the + // earliest file whose largest key >= ikey. Search left bound and + // right bound are used to narrow the range. + if (search_left_bound <= search_right_bound) { + if (search_right_bound == FileIndexer::kLevelMaxIndex) { + search_right_bound = + static_cast(curr_file_level->num_files) - 1; + } + // `search_right_bound` is an inclusive upper-bound, but since it was + // determined based on user key, it is still possible the lookup key + // falls to the right of `search_right_bound`'s corresponding file. + // So, pass a limit one higher, which allows us to detect this case. + start_index = static_cast(FindFileInRangeTmpl( + IKCmp{internal_comparator()}, *curr_file_level, ikey, + static_cast(search_left_bound), + static_cast(search_right_bound) + 1)); + if (start_index == search_right_bound + 1) { + // `ikey` comes after `search_right_bound`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } + } else { + // search_left_bound > search_right_bound, key does not exist in + // this level. Since no comparison is done in this level, it will + // need to search all files in the next level. 
+ search_left_bound = 0; + search_right_bound = FileIndexer::kLevelMaxIndex; + continue; + } + } + unsigned int start_index_in_curr_level __attribute__((unused)) = start_index; + unsigned int curr_index_in_curr_level = start_index; + while (curr_index_in_curr_level < curr_file_level->num_files) { + FdWithKeyRange* f = &curr_file_level->files[curr_index_in_curr_level]; + int hit_file_level = curr_level; + bool is_hit_file_last_in_level = + curr_index_in_curr_level == curr_file_level->num_files - 1; + (void)is_hit_file_last_in_level; + int cmp_largest = -1; + + // Do key range filtering of files or/and fractional cascading if: + // (1) not all the files are in level 0, or + // (2) there are more than 3 current level files + // If there are only 3 or less current level files in the system, we + // skip the key range filtering. In this case, more likely, the system + // is highly tuned to minimize number of tables queried by each query, + // so it is unlikely that key range filtering is more efficient than + // querying the files. + if (storage_info_.num_non_empty_levels_ > 1 || curr_file_level->num_files > 3) { + // Check if key is within a file's range. If search left bound and + // right bound point to the same find, we are sure key falls in + // range. 
+ UKCmp ucmp{user_comparator()}; + assert(curr_level == 0 || + curr_index_in_curr_level == start_index_in_curr_level || + ucmp(ikey.user_key, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = ucmp(ikey.user_key, ExtractUserKey(f->smallest_key)); + if (cmp_smallest >= 0) { + cmp_largest = ucmp(ikey.user_key, ExtractUserKey(f->largest_key)); + } + + // Setup file search bound for the next level based on the + // comparison results + if (curr_level > 0) { + storage_info_.file_indexer_.GetNextLevelIndex( + curr_level, curr_index_in_curr_level, cmp_smallest, + cmp_largest, &search_left_bound, &search_right_bound); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + if (curr_level == 0) { + ++curr_index_in_curr_level; + continue; + } else { + // Search next level. + break; + } + } + } + + // File passed filtering, process it if (*max_covering_tombstone_seq > 0) { // The remaining files we look at will only contain covered keys, so we // stop here. 
- break; + goto search_complete; } if (get_context.sample()) { sample_file_read_inc(f->file_metadata); } bool timer_enabled = + !std::is_same_v && perf_level >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; StopWatchNano timer(clock_, timer_enabled /* auto_start */); - *status = table_cache_->Get( + Status s2 = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, &get_context, mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, - cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - IsFilterSkipped(static_cast(fp.GetHitFileLevel()), - fp.IsHitFileLastInLevel()), - fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_); + #if defined(TOPLINGDB_WITH_FABRICATED_COMPLEXITY) + cfd_->internal_stats()->GetFileReadHist(hit_file_level), + IsFilterSkipped(static_cast(hit_file_level), + is_hit_file_last_in_level), + #else + nullptr, + false, + #endif + hit_file_level, max_file_size_for_l0_meta_pin_); // TODO: examine the behavior for corrupted key if (timer_enabled) { PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), - fp.GetHitFileLevel()); + hit_file_level); } - if (!status->ok()) { + if (UNLIKELY(!s2.ok())) { + *status = std::move(s2); if (db_statistics_ != nullptr) { get_context.ReportCounters(); } return; } + status->SetAsOK(); // report the counters before returning if (get_context.State() != GetContext::kNotFound && @@ -2761,16 +2713,16 @@ void Version::GetInst(const ReadOptions& read_options, const LookupKey& k, // TODO: update per-level perfcontext user_key_return_count for kMerge break; case GetContext::kFound: - if (fp.GetHitFileLevel() == 0) { + if (hit_file_level == 0) { RecordTick(db_statistics_, GET_HIT_L0); - } else if (fp.GetHitFileLevel() == 1) { + } else if (hit_file_level == 1) { RecordTick(db_statistics_, GET_HIT_L1); - } else if (fp.GetHitFileLevel() >= 2) { + } else if (hit_file_level >= 2) { 
RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, - fp.GetHitFileLevel()); + hit_file_level); if (is_blob_index && do_merge && (value || columns)) { Slice blob_index = @@ -2821,8 +2773,16 @@ void Version::GetInst(const ReadOptions& read_options, const LookupKey& k, *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed); return; } - f = fp.GetNextFile(); + + // Move to next file or level + if (curr_level > 0 && cmp_largest < 0) { + // No more files to search in this level. + break; + } + ++curr_index_in_curr_level; } +} +search_complete: if (db_statistics_ != nullptr) { get_context.ReportCounters(); } @@ -3071,7 +3031,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) { GetContext& get_context = *iter->get_context; Status* status = iter->s; - Slice user_key = iter->lkey->user_key(); + const Slice& user_key = iter->ikey.user_key; if (db_statistics_ != nullptr) { get_context.ReportCounters(); @@ -4901,7 +4861,7 @@ uint64_t VersionStorageInfo::NumLevelRawKV(int level) const { int VersionStorageInfo::FindFileInRange(int level, const Slice& key, uint32_t left, uint32_t right) const { return ROCKSDB_NAMESPACE::FindFileInRange(*internal_comparator_, - level_files_brief_[level], key, left, right); + level_files_brief_[level], ParsedInternalKey(key), left, right); } const char* VersionStorageInfo::LevelSummary( @@ -7125,7 +7085,7 @@ VersionSet::ApproximateSizeTmpl(const SizeApproximationOptions& options, // identify the file position for start key const int idx_start = - (int)FindFileInRangeTmpl(cmp, files_brief, start, 0, + (int)FindFileInRangeTmpl(cmp, files_brief, ParsedInternalKey(start), 0, static_cast(files_brief.num_files - 1)); assert(static_cast(idx_start) < files_brief.num_files); @@ -7133,7 +7093,7 @@ VersionSet::ApproximateSizeTmpl(const SizeApproximationOptions& options, int idx_end = idx_start; 
if (cmp(files_brief.files[idx_end].largest_key, end)) { idx_end = - (int)FindFileInRangeTmpl(cmp, files_brief, end, idx_start, + (int)FindFileInRangeTmpl(cmp, files_brief, ParsedInternalKey(end), idx_start, static_cast(files_brief.num_files - 1)); } assert(idx_end >= idx_start && diff --git a/db/version_set.h b/db/version_set.h index 647ae00fbf..2c4b3bf7e5 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -101,6 +101,9 @@ using VersionEditParams = VersionEdit; extern int FindFile(const InternalKeyComparator& icmp, const LevelFilesBrief& file_level, const Slice& key); +extern int FindFile(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, const ParsedInternalKey&); + // Returns true iff some file in "files" overlaps the user key range // [*smallest,*largest]. // smallest==nullptr represents a key smaller than all keys in the DB. @@ -894,7 +897,9 @@ class Version { // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held // REQUIRES: pinned_iters_mgr != nullptr - void Get(const ReadOptions& ro, const LookupKey& key, PinnableSlice* value, + template + void Get(const ReadOptions& ro, const ParsedInternalKey& key, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -903,14 +908,18 @@ class Version { SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr, bool do_merge = true) { - return m_get(this, ro, key, value, columns, timestamp, status, + auto f_get = std::is_same_v + ? 
m_get_no_watch : m_get; + return f_get(this, ro, key, value, columns, timestamp, status, merge_context, max_covering_tombstone_seq, pinned_iters_mgr, value_found, key_exists, seq, callback, is_blob, do_merge); } private: - template - void GetInst(const ReadOptions&, const LookupKey& key, PinnableSlice* value, + template + void GetInst(const ReadOptions&, const ParsedInternalKey&, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -920,7 +929,7 @@ class Version { bool* is_blob, bool do_merge); void (*m_get)(Version*, - const ReadOptions&, const LookupKey& key, PinnableSlice* value, + const ReadOptions&, const ParsedInternalKey&, PinnableSlice* value, PinnableWideColumns* columns, std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -928,6 +937,7 @@ class Version { bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, bool* is_blob, bool do_merge); + decltype(m_get) m_get_no_watch; public: diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h index 75776b620c..ec107252c3 100644 --- a/db/version_set_sync_and_async.h +++ b/db/version_set_sync_and_async.h @@ -145,7 +145,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) continue; case GetContext::kCorrupt: *status = - Status::Corruption("corrupted key for ", iter->lkey->user_key()); + Status::Corruption("corrupted key for ", iter->ikey.user_key); file_range.MarkKeyDone(iter); continue; case GetContext::kUnexpectedBlobIndex: diff --git a/db/wal_manager.cc b/db/wal_manager.cc index f3ff882325..66e1fc7cde 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -485,9 +485,18 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.fname = fname.c_str(); reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; + bool wal_memtable_format = 
db_options_.memtable_as_log_index; + if (db_options_.check_wal_format) { + if (IOStatus ios = log::Reader::IsMemTableAsLogIndexFile + (*db_options_.fs, fname, &wal_memtable_format); !ios.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "%s: %s", + fname.c_str(), *ios.ToSSO()); + return Status(ios); + } + } log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, true /*checksum*/, number); - if (db_options_.memtable_as_log_index) { + if (wal_memtable_format) { reader.InitSetMemTableAsLogIndex(*db_options_.fs); } std::string scratch; diff --git a/db_bench.sh b/db_bench.sh index 3f0c11b30a..4e0fbc5244 100644 --- a/db_bench.sh +++ b/db_bench.sh @@ -12,20 +12,23 @@ export ROCKSDB_KICK_OUT_OPTIONS_FILE=1 ulimit -n 100000 args=( -json sideplugin/rockside/sample-conf/db_bench_enterprise.yaml - -num=10000000 -key_size=8 - -value_size=2000 + #-num=10000000 + -key_size=8 + #-value_size=2000 -batch_size=100 #-benchmarks=fillseq,compact,nextwithkey,nextwithkey,nextwithkey,nextwithkey,nextwithkey,readseq,readseq,readseq,readseq,readseq -benchmarks=fillrandom,readrandom - #-benchmarks=fillseq,compact - #-benchmarks=compact -use_existing_db + #-benchmarks=fillseq,compact,readrandom # rand DB::Get < 100 nanosec + #-benchmarks=compact #-benchmarks=readrandom #-benchmarks=readseq #-benchmarks=nextwithkey #-wkey_file=${HOME}/wikipedia-title-seq.txt #-rkey_file=${HOME}/wikipedia-title-seq.txt #-threads=8 - -scan_omit_key -scan_omit_value + #-use_existing_db + -scan_omit_key + -scan_omit_value -enable_zero_copy # ToplingDB specific, for point search by Get/MultiGet ) ./db_bench ${args[@]} "$@" diff --git a/easy_db_bench.sh b/easy_db_bench.sh new file mode 100644 index 0000000000..e24cbfe743 --- /dev/null +++ b/easy_db_bench.sh @@ -0,0 +1,34 @@ +#!/bin/bash -ex + +#rm -rf /dev/shm/db_bench_enterprise +#rm -rf /tmp/db_bench_enterprise +mkdir -p /dev/shm/db_bench_enterprise +#mkdir -p /tmp/db_bench_enterprise +cp sideplugin/rockside/src/topling/web/{index.html,style.css} 
/dev/shm/db_bench_enterprise/ + +export TOPLINGDB_GetContext_sampling=kNone +export ROCKSDB_KICK_OUT_OPTIONS_FILE=1 +export TOPLINGDB_EASY_MIGRATE_CONF=sideplugin/rockside/sample-conf/db_bench_enterprise.yaml +#export PRINT_NOT_FOUND=true +ulimit -n 100000 +args=( + #-num=10000000 + -key_size=8 + #-value_size=2000 + -batch_size=100 + #-benchmarks=fillseq,compact,nextwithkey,nextwithkey,nextwithkey,nextwithkey,nextwithkey,readseq,readseq,readseq,readseq,readseq + -benchmarks=fillrandom,readrandom + #-benchmarks=fillseq,compact,readrandom # rand DB::Get < 100 nanosec + #-benchmarks=compact + #-benchmarks=readrandom + #-benchmarks=readseq + #-benchmarks=nextwithkey + #-wkey_file=${HOME}/wikipedia-title-seq.txt + #-rkey_file=${HOME}/wikipedia-title-seq.txt + #-threads=8 + #-use_existing_db + -scan_omit_key + -scan_omit_value + -enable_zero_copy # ToplingDB specific, for point search by Get/MultiGet +) +./db_bench ${args[@]} "$@" diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 263b40491a..2ed5667240 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -67,6 +67,7 @@ extern "C" { #include #include #include +#include /* Exported types */ @@ -143,6 +144,7 @@ typedef struct rocksdb_statistics_histogram_data_t rocksdb_statistics_histogram_data_t; typedef struct rocksdb_wait_for_compact_options_t rocksdb_wait_for_compact_options_t; +typedef struct rocksdb_stdstr_t rocksdb_stdstr_t; #if !defined(ROCKSDB_C_API_IMPLEMENTATION) struct rocksdb_slice_t { @@ -1941,6 +1943,8 @@ extern ROCKSDB_LIBRARY_API_WEAK void rocksdb_readoptions_start_pin( rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API_WEAK void rocksdb_readoptions_finish_pin( rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API_WEAK unsigned char +rocksdb_readoptions_is_in_pinning_section(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API_WEAK void rocksdb_readoptions_set_async_queue_depth( rocksdb_readoptions_t*, size_t); extern ROCKSDB_LIBRARY_API_WEAK size_t 
rocksdb_readoptions_get_async_queue_depth( @@ -2970,6 +2974,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy( extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( const rocksdb_pinnableslice_t* t, size_t* vlen); +extern ROCKSDB_LIBRARY_API rocksdb_stdstr_t* rocksdb_stdstr_create(const char* str, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_stdstr_destroy(rocksdb_stdstr_t* v); + extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* rocksdb_memory_consumers_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( @@ -3083,6 +3090,9 @@ extern ROCKSDB_LIBRARY_API_WEAK side_plugin_repo_t* side_plugin_repo_create(void extern ROCKSDB_LIBRARY_API_WEAK void side_plugin_repo_import_auto_file (side_plugin_repo_t*, const char* fname, char** errptr); +extern ROCKSDB_LIBRARY_API_WEAK void +side_plugin_repo_import(side_plugin_repo_t*, const char* json_str, char** errptr); + extern ROCKSDB_LIBRARY_API_WEAK rocksdb_t* side_plugin_repo_open(side_plugin_repo_t*, rocksdb_column_family_handle_t***, size_t* num_cf, char** errptr); @@ -3102,10 +3112,120 @@ side_plugin_repo_get_cf_options(side_plugin_repo_t*, const char* name, char** er extern ROCKSDB_LIBRARY_API_WEAK void side_plugin_repo_put_cf_options(side_plugin_repo_t*, const char* name, rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API_WEAK bool +side_plugin_db_options_update_from(rocksdb_options_t*, const side_plugin_repo_t*, const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK bool +side_plugin_cf_options_update_from(rocksdb_options_t*, const side_plugin_repo_t*, const char* name); + extern ROCKSDB_LIBRARY_API_WEAK void side_plugin_repo_close_all(side_plugin_repo_t*); +extern ROCKSDB_LIBRARY_API_WEAK void side_plugin_repo_forget_db(side_plugin_repo_t*, rocksdb_t*); + extern ROCKSDB_LIBRARY_API_WEAK const char* rocksdb_get_name(rocksdb_t*); +struct side_plugin_ex_vtab_t { + // serialize_request == NULL means serde(all the 4) are not supported + void (* 
serialize_request )(FILE*, const void* obj); + void (*deserialize_request )(FILE*, void* obj); + void (* serialize_response)(FILE*, const void* obj); + void (*deserialize_response)(FILE*, void* obj); + + // web_view == NULL means web view and update are not supported + // web_update == NULL means web only update is not supported + rocksdb_stdstr_t* (*web_view)(const void* obj, const char* dump_options_json, const side_plugin_repo_t*); + void (*web_update)(void* obj, const char* dump_options_json, const char* body_json, const side_plugin_repo_t*); +}; +#if !defined(__cplusplus) +typedef struct side_plugin_ex_vtab_t side_plugin_ex_vtab_t; +#endif + +typedef const rocksdb_comparator_t* +(*rocksdb_comparator_creator_t) +(const char* strjson, const side_plugin_repo_t* repo); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_register_comparator +(const char* name, rocksdb_comparator_creator_t, const side_plugin_ex_vtab_t*); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_unregister_comparator(const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK +void* side_plugin_comparator_get_state(const rocksdb_comparator_t*); + +typedef rocksdb_mergeoperator_t* +(*rocksdb_mergeoperator_creator_t) +(const char* strjson, const side_plugin_repo_t* repo); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_register_merge_operator +(const char* name, rocksdb_mergeoperator_creator_t, const side_plugin_ex_vtab_t*); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_unregister_merge_operator(const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK +void* side_plugin_mergeoperator_get_state(const rocksdb_mergeoperator_t*); + +typedef rocksdb_compactionfilterfactory_t* +(*rocksdb_compactionfilterfactory_creator_t) +(const char* strjson, const side_plugin_repo_t* repo); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_register_compaction_filter_factory +(const char* name, rocksdb_compactionfilterfactory_creator_t, const side_plugin_ex_vtab_t*); + +extern ROCKSDB_LIBRARY_API_WEAK 
+void side_plugin_unregister_compaction_filter_factory(const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK +void* side_plugin_compactionfilterfactory_get_state(const rocksdb_compactionfilterfactory_t*); + +typedef rocksdb_slicetransform_t* +(*rocksdb_slicetransform_creator_t) +(const char* strjson, const side_plugin_repo_t* repo); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_register_slicetransform +(const char* name, rocksdb_slicetransform_creator_t, const side_plugin_ex_vtab_t*); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_unregister_slicetransform(const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK +void* side_plugin_slicetransform_get_state(const rocksdb_slicetransform_t*); + +typedef rocksdb_filterpolicy_t* +(*rocksdb_filterpolicy_creator_t) +(const char* strjson, const side_plugin_repo_t* repo); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_register_filterpolicy +(const char* name, rocksdb_filterpolicy_creator_t, const side_plugin_ex_vtab_t*); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_unregister_filterpolicy(const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK +void* side_plugin_filterpolicy_get_state(const rocksdb_filterpolicy_t*); + +#if 0 // rocksdb c api does not support custom rate limiter +typedef rocksdb_ratelimiter_t* +(*rocksdb_ratelimiter_creator_t) +(const char* strjson, const side_plugin_repo_t* repo); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_register_ratelimiter +(const char* name, rocksdb_ratelimiter_creator_t, const side_plugin_ex_vtab_t*); + +extern ROCKSDB_LIBRARY_API_WEAK +void side_plugin_unregister_ratelimiter(const char* name); + +extern ROCKSDB_LIBRARY_API_WEAK +void* side_plugin_ratelimiter_get_state(const rocksdb_ratelimiter_t*); +#endif + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 36888a9bd8..5ddfc6d76a 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -83,6 +83,14 
@@ class MemTableRep : public CacheAlignedNewDelete { virtual int operator()(const char* prefix_len_key, const Slice& key) const = 0; + // Compare prefix_len_key (encoded internal key) with user_key + tag + virtual int operator()(const char* prefix_len_key, + const struct ParsedInternalKey&) const = 0; + + // Compare user_key + tag with prefix_len_key (encoded internal key) + virtual int operator()(const struct ParsedInternalKey&, + const char* prefix_len_key) const = 0; + virtual const InternalKeyComparator* icomparator() const = 0; virtual ~KeyComparator() {} @@ -242,6 +250,10 @@ class MemTableRep : public CacheAlignedNewDelete { const LookupKey&, void* callback_args, bool (*callback_func)(void* arg, const KeyValuePair&)) = 0; + virtual void GetPIK(const struct ReadOptions&, + const struct ParsedInternalKey&, void* callback_args, + bool (*callback_func)(void* arg, const KeyValuePair&)); + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, const Slice& /*end_key*/) { return 0; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 739724d8ba..eb3f17909c 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -925,6 +925,18 @@ struct DBOptions { bool memtable_as_log_index = false; + // If true, each WAL file is probed on DB open to auto-detect its on-disk + // format, so recovery works even when memtable_as_log_index was changed + // between runs. + // + // Defaults to false because the probe relies on CRC32 self-consistency + // rather than a magic number to distinguish the two formats, which carries + // a 1/2^32 false-positive risk per file. Always probing would expose this + // risk on every open; turning this on only when needed reduces the + // exposure by orders of magnitude (only when a format switch actually + // occurred). 
+ bool check_wal_format = false; + // if not zero, periodically take stats snapshots and store in memory, the // memory size for stats snapshots is capped at stats_history_buffer_size // Default: 1MB @@ -1875,7 +1887,7 @@ struct ReadOptions { ~ScopePinIfNotPinned() { if (ro_) ro_->FinishPin(); } }; - ReadOptions() {} + ReadOptions(); ReadOptions(bool _verify_checksums, bool _fill_cache); explicit ReadOptions(Env::IOActivity _io_activity); ReadOptions(const ReadOptions&, BooleanDontCopyTrue/*dispatch_tag*/); diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 01f892f81c..5395e66697 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -33,6 +33,10 @@ namespace ROCKSDB_NAMESPACE { class Slice { public: + typedef char value_type; + typedef const char &const_reference, &reference; + typedef const char *const_iterator, *iterator; + // Create an empty slice. Slice() : data_(""), size_(0) {} diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index e6afa6fa09..02d6fa49c4 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -29,6 +29,7 @@ #endif #include "rocksdb/slice.h" +#include namespace ROCKSDB_NAMESPACE { @@ -495,6 +496,7 @@ class Status { // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
std::string ToString() const; + terark::minimal_sso<32> ToSSO() const; void swap(Status& y) { static_assert(sizeof(Status) == 2*sizeof(uint64_t)); diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 9dceb1997c..0e4e1c73b1 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -127,6 +127,7 @@ jlongArray rocksdb_open_helper( [](const char* str_data, const size_t str_len) { return std::string(str_data, str_len); }, + terark::_rvref * [&jco, &column_families](size_t idx, std::string cf_name) { ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options = reinterpret_cast(jco[idx]); diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc index 8a8438671b..3b408d315d 100644 --- a/java/rocksjni/transaction.cc +++ b/java/rocksjni/transaction.cc @@ -154,11 +154,6 @@ void Java_org_rocksdb_Transaction_rollbackToSavePoint(JNIEnv* env, } } -typedef std::function - FnGet; - /* * Class: org_rocksdb_Transaction * Method: get diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 1fe2083d99..178ca0203f 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -81,6 +81,7 @@ jlongArray Java_org_rocksdb_TtlDB_openCF(JNIEnv* env, jclass, jlong jopt_handle, [](const char* str_data, const size_t str_len) { return std::string(str_data, str_len); }, + terark::_rvref * [&jco, &column_families](size_t idx, std::string cf_name) { ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options = reinterpret_cast(jco[idx]); diff --git a/options/db_options.cc b/options/db_options.cc index 4a1663e4eb..dc17320e4e 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -328,6 +328,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, memtable_as_log_index), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"check_wal_format", + {offsetof(struct ImmutableDBOptions, check_wal_format), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, 
{"fail_if_options_file_error", {offsetof(struct ImmutableDBOptions, fail_if_options_file_error), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -764,6 +768,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), persist_stats_to_disk(options.persist_stats_to_disk), memtable_as_log_index(options.memtable_as_log_index), + check_wal_format(options.check_wal_format), write_dbid_to_manifest(options.write_dbid_to_manifest), log_readahead_size(options.log_readahead_size), file_checksum_gen_factory(options.file_checksum_gen_factory), @@ -933,6 +938,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { persist_stats_to_disk); ROCKS_LOG_HEADER(log, " Options.memtable_as_log_index: %u", memtable_as_log_index); + ROCKS_LOG_HEADER(log, " Options.check_wal_format: %u", + check_wal_format); ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", write_dbid_to_manifest); ROCKS_LOG_HEADER( diff --git a/options/db_options.h b/options/db_options.h index 024af355a7..e0c618d637 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -90,6 +90,7 @@ struct ImmutableDBOptions { bool avoid_unnecessary_blocking_io; bool persist_stats_to_disk; bool memtable_as_log_index; + bool check_wal_format; bool write_dbid_to_manifest; size_t log_readahead_size; std::shared_ptr file_checksum_gen_factory; diff --git a/options/options.cc b/options/options.cc index 05e800e5d8..ee7199e11d 100644 --- a/options/options.cc +++ b/options/options.cc @@ -718,6 +718,9 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) { static const bool g_cache_sst_file_iter = terark::getEnvBool("TOPLINGDB_CACHE_SST_FILE_ITER", false); +ReadOptions::ReadOptions() { + cache_sst_file_iter = g_cache_sst_file_iter; +} ReadOptions::ReadOptions(bool _verify_checksums, bool _fill_cache) : verify_checksums(_verify_checksums), fill_cache(_fill_cache) { cache_sst_file_iter = g_cache_sst_file_iter; diff --git 
a/options/options_settable_test.cc b/options/options_settable_test.cc index 42340073f9..2cd675a16a 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -338,6 +338,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "stats_persist_period_sec=54321;" "persist_stats_to_disk=true;" "memtable_as_log_index=true;" + "check_wal_format=true;" "stats_history_buffer_size=14159;" "allow_fallocate=true;" "allow_mmap_reads=false;" diff --git a/options/options_test.cc b/options/options_test.cc index a92d8f844c..450f38c400 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -170,6 +170,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"stats_persist_period_sec", "57"}, {"persist_stats_to_disk", "false"}, {"memtable_as_log_index", "false"}, + {"check_wal_format", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, @@ -354,6 +355,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.memtable_as_log_index, false); + ASSERT_EQ(new_db_opt.check_wal_format, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); @@ -2392,6 +2394,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"stats_persist_period_sec", "57"}, {"persist_stats_to_disk", "false"}, {"memtable_as_log_index", "false"}, + {"check_wal_format", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, @@ -2578,6 +2581,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.memtable_as_log_index, false); + ASSERT_EQ(new_db_opt.check_wal_format, false); 
ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); diff --git a/sideplugin/rockside b/sideplugin/rockside index 07d3333f80..345c8cf538 160000 --- a/sideplugin/rockside +++ b/sideplugin/rockside @@ -1 +1 @@ -Subproject commit 07d3333f80e154f14299a4877a13ed1801ff93b5 +Subproject commit 345c8cf5382ccfe7be0665f8478aa1f96f36ed67 diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index e7621909cc..0cbe31b7e4 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -408,8 +408,9 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) for (auto miter = data_block_range.begin(); miter != data_block_range.end(); ++miter) { - const Slice& key = miter->ikey; - iiter->Seek(miter->ikey); + const auto ikbuf = miter->InternalKeyBuf(); + const Slice key = ikbuf; + iiter->Seek(key); IndexValue v; if (iiter->Valid()) { @@ -580,7 +581,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) ++miter) { Status s; GetContext* get_context = miter->get_context; - const Slice& key = miter->ikey; + const auto ikbuf = miter->InternalKeyBuf(); + const Slice key = ikbuf; bool matched = false; // if such user key matched a key in SST bool done = false; bool first_block = true; diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index 254546893f..86bb7b780c 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -417,6 +417,7 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) { autovector get_context; autovector key_context; autovector sorted_keys; + get_context.reserve(keys.size()); for (size_t i = 0; i < keys.size(); ++i) { get_context.emplace_back(options.comparator, nullptr, nullptr, nullptr, 
GetContext::kNotFound, ExtractUserKey(keys[i]), diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index b14858c020..6e400f25a4 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -120,7 +120,8 @@ class FilterBlockReader { const ReadOptions& read_options) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey_without_ts = iter->ukey_without_ts; - const Slice ikey = iter->ikey; + const auto ikbuf = iter->InternalKeyBuf(); + const Slice ikey = ikbuf; // convert from named ikbuf GetContext* const get_context = iter->get_context; if (!KeyMayMatch(ukey_without_ts, no_io, &ikey, get_context, lookup_context, read_options)) { @@ -145,7 +146,8 @@ class FilterBlockReader { const ReadOptions& read_options) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey_without_ts = iter->ukey_without_ts; - const Slice ikey = iter->ikey; + const auto ikbuf = iter->InternalKeyBuf(); + const Slice ikey = ikbuf; // convert from named ikbuf GetContext* const get_context = iter->get_context; if (prefix_extractor->InDomain(ukey_without_ts) && !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), no_io, diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 817fe94245..565ca6da82 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -138,6 +138,12 @@ class PartitionedFilterBlockReader BlockHandle GetFilterPartitionHandle( const CachableEntry& filter_block, const Slice& entry) const; + BlockHandle GetFilterPartitionHandle( + const CachableEntry& filter_block, + const ParsedInternalKey& entry) const { + // overload this function can minimize diff, the caller need not change + return GetFilterPartitionHandle(filter_block, entry.MakeInternalKeyBuf()); + } Status GetFilterPartitionBlock( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, bool 
no_io, GetContext* get_context, diff --git a/table/get_context.cc b/table/get_context.cc index c2a4dc887b..9b1dd47026 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -61,6 +61,9 @@ GetContext::GetContext( if (seq) { *seq = kMaxSequenceNumber; } + if (statistics) { + new(&get_context_stats_)GetContextStats(); + } switch (g_how_sampling) { case GetContextSampleRead::kAlways: sample_ = true; break; case GetContextSampleRead::kNone : sample_ = false; break; @@ -133,6 +136,9 @@ void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) { } void GetContext::ReportCounters() { + if (!statistics_) { + return; + } if (get_context_stats_.num_cache_hit > 0) { RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit); } diff --git a/table/get_context.h b/table/get_context.h index 8579d08982..dbea262d6f 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -77,7 +77,9 @@ class GetContext { kUnexpectedBlobIndex, kMergeOperatorFailed, }; + union { GetContextStats get_context_stats_; + }; // Constructor // @param value Holds the value corresponding to user_key. 
If its nullptr diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index cde6320eb0..f9ccc4a57e 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,10 +56,8 @@ class IteratorWrapperBase { result_.is_valid = false; } else { #if TOPLING_USE_BOUND_PMF - next_and_get_result_ = ExtractFuncPtr - (_iter, &InternalIteratorBase::NextAndGetResult); - prepare_and_get_value_ = ExtractFuncPtr - (_iter, &InternalIteratorBase::PrepareAndGetValue); + next_and_get_result_ = ForgeFuncPtr(_iter, &InternalIteratorBase::NextAndGetResult); + prepare_and_get_value_ = ForgeFuncPtr(_iter, &InternalIteratorBase::PrepareAndGetValue); #endif Update(); } @@ -289,10 +287,8 @@ class ThinIteratorWrapperBase { iter_ = i; if (i) { #if TOPLING_USE_BOUND_PMF - next_and_get_result_ = ExtractFuncPtr - (i, &InternalIteratorBase::NextAndGetResult); - prepare_and_get_value_ = ExtractFuncPtr - (i, &InternalIteratorBase::PrepareAndGetValue); + next_and_get_result_ = ForgeFuncPtr(i, &InternalIteratorBase::NextAndGetResult); + prepare_and_get_value_ = ForgeFuncPtr(i, &InternalIteratorBase::PrepareAndGetValue); #endif } return old_iter; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index f2a7f8995a..ea257d5e0a 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -150,14 +150,7 @@ class MaxHeapItemComparator { const InternalKeyComparator* comparator_; }; -#if defined(_MSC_VER) /* Visual Studio */ -#define FORCE_INLINE __forceinline -#define __bswap_64 _byteswap_uint64 -#elif defined(__GNUC__) -#define FORCE_INLINE inline __attribute__((always_inline)) -#else -#define FORCE_INLINE inline -#endif +#define FORCE_INLINE terark_forceinline #if defined(__AVX512VL__) && defined(__AVX512BW__) // can be defined as 23 or 16 @@ -272,6 +265,7 @@ struct UintPrefix { unsigned char data[MERGE_ITER_PREFIX_LEN] = {0}; UintPrefix(int=0) {} }; +static_assert(sizeof(UintPrefix) == MERGE_ITER_PREFIX_LEN); #endif // MERGE_ITER_PREFIX_LEN diff --git 
a/table/multiget_context.h b/table/multiget_context.h index 8c7beb2c11..0aafd460b3 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -25,11 +25,25 @@ class GetContext; class PinnableWideColumns; struct KeyContext { +#if defined(TOPLINGDB_WITH_TIMESTAMP) const Slice* key; LookupKey* lkey; - Slice ukey_with_ts; + union { + ParsedInternalKey ikey; + Slice ukey_with_ts; // at ikey.user_key + }; Slice ukey_without_ts; - Slice ikey; + // long live & fast + auto InternalKeyBuf() const { return lkey->internal_key(); } +#else + union { + ParsedInternalKey ikey; + Slice ukey_with_ts; // at ikey.user_key + Slice ukey_without_ts; // at ikey.user_key + }; + // temporary & slow + auto InternalKeyBuf() const { return ikey.MakeInternalKeyBuf(); } +#endif ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; @@ -45,8 +59,13 @@ struct KeyContext { KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, PinnableSlice* val, PinnableWideColumns* cols, std::string* ts, Status* stat) +#if defined(TOPLINGDB_WITH_TIMESTAMP) : key(&user_key), lkey(nullptr), + ukey_without_ts(user_key), // must init +#else + : ukey_without_ts(user_key), // keep ikey.tag raw mem +#endif column_family(col_family), s(stat), max_covering_tombstone_seq(0), @@ -113,8 +132,11 @@ class MultiGetContext { Statistics* stats) : num_keys_(num_keys), value_mask_(0), - value_size_(0), + value_size_(0) +#if defined(TOPLINGDB_WITH_TIMESTAMP) + , lookup_key_ptr_(reinterpret_cast(lookup_key_stack_buf)) +#endif #if USE_COROUTINES , reader_(fs, stats), @@ -124,41 +146,44 @@ class MultiGetContext { (void)fs; (void)stats; assert(num_keys <= MAX_BATCH_SIZE); +#if defined(TOPLINGDB_WITH_TIMESTAMP) if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) { lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]); lookup_key_ptr_ = reinterpret_cast(lookup_key_heap_buf.get()); } +#endif + ROCKSDB_ASSERT_LE(begin + num_keys, sorted_keys->size()); for (size_t iter = 0; iter != num_keys_; 
++iter) { // autovector may not be contiguous storage, so make a copy sorted_keys_[iter] = (*sorted_keys)[begin + iter]; + #if defined(TOPLINGDB_WITH_TIMESTAMP) sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp); sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key(); - #if defined(TOPLINGDB_WITH_TIMESTAMP) sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey( sorted_keys_[iter]->lkey->user_key(), read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size()); - #else - sorted_keys_[iter]->ukey_without_ts = sorted_keys_[iter]->lkey->user_key(); - #endif - - sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key(); - - #if defined(TOPLINGDB_WITH_TIMESTAMP) sorted_keys_[iter]->timestamp = (*sorted_keys)[begin + iter]->timestamp; + #else + static_assert(offsetof(KeyContext, ikey.user_key) == offsetof(KeyContext, ukey_without_ts)); #endif + static_assert(offsetof(KeyContext, ikey.user_key) == offsetof(KeyContext, ukey_with_ts)); + sorted_keys_[iter]->ikey.sequence = snapshot; + sorted_keys_[iter]->ikey.type = kValueTypeForSeek; sorted_keys_[iter]->get_context = (*sorted_keys)[begin + iter]->get_context; } } +#if defined(TOPLINGDB_WITH_TIMESTAMP) ~MultiGetContext() { for (size_t i = 0; i < num_keys_; ++i) { lookup_key_ptr_[i].~LookupKey(); } } +#endif #if USE_COROUTINES SingleThreadExecutor& executor() { return executor_; } @@ -168,15 +193,19 @@ class MultiGetContext { private: static const int MAX_LOOKUP_KEYS_ON_STACK = 16; +#if defined(TOPLINGDB_WITH_TIMESTAMP) alignas( alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK]; +#endif std::array sorted_keys_; size_t num_keys_; Mask value_mask_; uint64_t value_size_; +#if defined(TOPLINGDB_WITH_TIMESTAMP) std::unique_ptr lookup_key_heap_buf; LookupKey* lookup_key_ptr_; +#endif #if USE_COROUTINES AsyncFileReader reader_; SingleThreadExecutor executor_; diff --git 
a/table/table_reader.h b/table/table_reader.h index 3a2c46c62e..8db7ade1d9 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -113,6 +113,9 @@ class TableReader : public CacheAlignedNewDelete { // Prepare work that can be done before the real Get() virtual void Prepare(const Slice& /*target*/) {} + virtual void PreparePIK(const ParsedInternalKey& pik) { + Prepare(pik.MakeInternalKeyBuf()); + } // Report an approximation of how much memory has been used. virtual size_t ApproximateMemoryUsage() const = 0; @@ -133,6 +136,14 @@ class TableReader : public CacheAlignedNewDelete { const SliceTransform* prefix_extractor, bool skip_filters = false) = 0; + virtual Status GetPIK(const ReadOptions& ro, const ParsedInternalKey& pik, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters = false) { + auto ikbuf = pik.MakeInternalKeyBuf(); + return Get(ro, ikbuf, get_context, prefix_extractor, skip_filters); + } + // Use bloom filters in the table file, if present, to filter out keys. The // mget_range will be updated to skip keys that get a negative result from // the filter lookup. 
@@ -147,7 +158,7 @@ class TableReader : public CacheAlignedNewDelete { const SliceTransform* prefix_extractor, bool skip_filters = false) { for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) { - *iter->s = Get(readOptions, iter->ikey, iter->get_context, + *iter->s = GetPIK(readOptions, iter->ikey, iter->get_context, prefix_extractor, skip_filters); } } diff --git a/table/table_test.cc b/table/table_test.cc index 33d823920d..56f60f94d2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -3751,14 +3751,14 @@ TEST_P(BlockBasedTableTest, TracingMultiGetTest) { /*PinnableWideColumns omitted*/ nullptr, /*timestamp omitted*/ nullptr, statuses.data()); key_context[0].ukey_without_ts = ukeys[0]; - key_context[0].ikey = encoded_keys[0]; + key_context[0].ikey = ParsedInternalKey(encoded_keys[0]); key_context[0].get_context = get_contexts.data(); key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[1], &values[1], /*PinnableWideColumns omitted*/ nullptr, /*timestamp omitted*/ nullptr, &statuses[1]); key_context[1].ukey_without_ts = ukeys[1]; - key_context[1].ikey = encoded_keys[1]; + key_context[1].ikey = ParsedInternalKey(encoded_keys[1]); key_context[1].get_context = &get_contexts[1]; autovector sorted_keys; sorted_keys.push_back(&key_context[0]); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 5fe196c7dc..33752d8b75 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -2749,9 +2749,23 @@ void DumpWalFile(Options options, std::string wal_file, bool print_header, // bogus input, carry on as best we can log_number = 0; } + bool wal_memtable_format = options.memtable_as_log_index; + if (options.check_wal_format) { + if (IOStatus ios = log::Reader::IsMemTableAsLogIndexFile + (*fs, wal_file, &wal_memtable_format); !ios.ok()) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::Failed( + "Failed to detect WAL format " + ios.ToString()); + } else { + std::cerr << "Error: Failed to detect WAL format " + << 
ios.ToString() << std::endl; + } + return; + } + } log::Reader reader(options.info_log, std::move(wal_file_reader), &reporter, true /* checksum */, log_number); - if (options.memtable_as_log_index) { + if (wal_memtable_format) { reader.InitSetMemTableAsLogIndex(*fs); } std::string scratch; diff --git a/util/status.cc b/util/status.cc index 160755d54d..978f315b13 100644 --- a/util/status.cc +++ b/util/status.cc @@ -160,4 +160,8 @@ std::string Status::ToString() const { return result; } +terark::minimal_sso<32> Status::ToSSO() const { + return terark::minimal_sso<32>{ToString()}; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/util/stop_watch.h b/util/stop_watch.h index 105a99c1fa..8d2ea93e14 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -15,6 +15,9 @@ #endif namespace ROCKSDB_NAMESPACE { + +class StopWatchNano; + // Auto-scoped. // When statistics is not nullptr, records the measured time into any enabled // histograms supplied to the constructor. A histogram argument may be omitted @@ -23,6 +26,7 @@ namespace ROCKSDB_NAMESPACE { // added to *elapsed if overwrite is false. class StopWatch { public: + typedef StopWatchNano WatchNano; inline StopWatch(SystemClock* clock, Statistics* statistics, const uint32_t hist_type) noexcept : @@ -223,6 +227,7 @@ class StopWatchNano { }; struct FakeStopWatch { + typedef FakeStopWatch WatchNano; FakeStopWatch(...) {} void DelayStart() {} void DelayStop() {} diff --git a/util/string_util.cc b/util/string_util.cc index b47140cbdc..02213c8ef0 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -118,13 +118,13 @@ std::string NumberToHumanString(int64_t num) { char buf[19]; int64_t absnum = num < 0 ? 
-num : num; if (absnum < 10000) { - snprintf(buf, sizeof(buf), "%" PRIi64, num); + snprintf(buf, sizeof(buf), "%4" PRIi64 " ", num); } else if (absnum < 10000000) { - snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000); + snprintf(buf, sizeof(buf), "%4" PRIi64 "K", num / 1000); } else if (absnum < 10000000000LL) { - snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000); + snprintf(buf, sizeof(buf), "%4" PRIi64 "M", num / 1000000); } else { - snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000); + snprintf(buf, sizeof(buf), "%4" PRIi64 "G", num / 1000000000); } return std::string(buf); } @@ -144,7 +144,7 @@ std::string BytesToHumanString(uint64_t bytes) { } char buf[20]; - snprintf(buf, sizeof(buf), "%.2f %s", final_size, size_name[size_idx]); + snprintf(buf, sizeof(buf), "%7.2f %s", final_size, size_name[size_idx]); return std::string(buf); } diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index b6fe039036..1bcf605884 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -46,6 +46,9 @@ Status BlobDB::Open(const DBOptions& db_options, return Status::NotSupported( "Blob DB doesn't support non-default column family."); } + MaybeOptionsUpdateFrom(const_cast(&db_options), + const_cast*>(&column_families), + dbname); BlobDBImpl* blob_db_impl = new BlobDBImpl(dbname, bdb_options, db_options, column_families[0].options); diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 4a370fcb53..ca71ac2763 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -28,6 +28,8 @@ namespace ROCKSDB_NAMESPACE { +using terark::_rvref; + struct WriteOptions; std::atomic PessimisticTransaction::txn_id_counter_(1); @@ -692,7 +694,7 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() { Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf)); Status s = - wb->UpdateTimestamps(commit_ts, [wbwi, 
this](uint32_t cf) -> size_t { + wb->UpdateTimestamps(commit_ts, _rvref*[wbwi, this](uint32_t cf) -> size_t { auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf); if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) { return sizeof(kMaxTxnTimestamp); @@ -776,7 +778,7 @@ Status WriteCommittedTxn::CommitInternal() { s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_, commit_ts); if (s.ok()) { - s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t { + s = wb->UpdateTimestamps(commit_ts, _rvref*[wbwi, this](uint32_t cf) -> size_t { if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) != cfs_with_ts_tracked_when_indexing_disabled_.end()) { return sizeof(kMaxTxnTimestamp);