From e24ecd833c25719c0a99da90eb4609e6f7e182b4 Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Thu, 25 Sep 2025 13:50:56 +0300 Subject: [PATCH 1/7] Client compression for replication --- .github/workflows/ci.yml | 26 +- .github/workflows/daily.yml | 138 ++- redis.conf | 10 + src/Makefile | 14 +- src/ae.h | 2 +- src/client_comp.c | 919 +++++++++++++++++++ src/client_comp.h | 42 + src/cluster_asm.c | 12 +- src/cluster_legacy.c | 2 +- src/config.c | 2 + src/connection.c | 2 + src/connection.h | 81 +- src/iothread.c | 101 +- src/networking.c | 94 +- src/replication.c | 110 ++- src/server.c | 11 + src/server.h | 21 +- src/socket.c | 33 +- src/tls.c | 2 + src/unix.c | 2 + tests/instances.tcl | 3 + tests/integration/psync2-master-restart.tcl | 5 +- tests/integration/replication-buffer.tcl | 36 +- tests/integration/replication-rdbchannel.tcl | 33 +- tests/integration/replication.tcl | 12 +- tests/support/test.tcl | 3 + tests/support/util.tcl | 33 +- tests/test_helper.tcl | 3 + 28 files changed, 1618 insertions(+), 134 deletions(-) create mode 100644 src/client_comp.c create mode 100644 src/client_comp.h diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 92eb4e29621..74a71520f6a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,9 @@ jobs: - name: make # Fail build if there are warnings # build with TLS just for compilation coverage - run: make REDIS_CFLAGS='-Werror' BUILD_TLS=yes + run: | + sudo apt-get install -y libzstd-dev + make REDIS_CFLAGS='-Werror' BUILD_TLS=yes - name: test run: | sudo apt-get install tcl8.6 tclx @@ -33,7 +35,9 @@ jobs: - uses: actions/checkout@v6 - name: make # build with TLS module just for compilation coverage - run: make SANITIZER=address REDIS_CFLAGS='-Werror -DDEBUG_ASSERTIONS -DREDIS_TEST' BUILD_TLS=module + run: | + sudo apt-get install -y libzstd-dev + make SANITIZER=address REDIS_CFLAGS='-Werror -DDEBUG_ASSERTIONS -DREDIS_TEST' BUILD_TLS=module - name: testprep run: sudo apt-get install tcl8.6 tclx -y - name: test @@ -48,7 +52,8 @@ jobs: run: | sed -i 's|http://deb.debian.org/debian|http://archive.debian.org/debian|g' /etc/apt/sources.list sed -i 's|http://security.debian.org|http://archive.debian.org/debian-security|g' /etc/apt/sources.list - apt-get update && apt-get install -y build-essential + echo "deb http://archive.debian.org/debian buster-backports main" >> /etc/apt/sources.list + apt-get update && apt-get install -y build-essential libzstd-dev/buster-backports make REDIS_CFLAGS='-Werror' build-macos-latest: @@ -58,7 +63,9 @@ jobs: - name: make # Fail build if there are warnings # build with TLS just for compilation coverage - run: make REDIS_CFLAGS='-Werror' BUILD_TLS=yes + run: | + brew install zstd + make REDIS_CFLAGS='-Werror' BUILD_TLS=yes build-32bit: runs-on: ubuntu-latest @@ -66,7 +73,8 @@ jobs: - uses: actions/checkout@v6 - name: make run: | - sudo apt-get update && sudo apt-get install libc6-dev-i386 gcc-multilib + sudo dpkg --add-architecture i386 + sudo apt-get update && sudo apt-get install libc6-dev-i386 gcc-multilib libzstd-dev:i386 make REDIS_CFLAGS='-Werror' 32bit build-libc-malloc: @@ -74,7 +82,9 @@ jobs: steps: - uses: actions/checkout@v6 - name: make - run: make REDIS_CFLAGS='-Werror' MALLOC=libc + run: | + sudo apt-get install -y libzstd-dev + make REDIS_CFLAGS='-Werror' MALLOC=libc build-centos-jemalloc: runs-on: ubuntu-latest @@ -83,7 +93,7 @@ jobs: - uses: actions/checkout@v6 - name: make run: | - dnf -y install which gcc make + dnf -y install which gcc make libzstd-devel make REDIS_CFLAGS='-Werror' build-old-chain-jemalloc: @@ -100,6 +110,6 @@ jobs: apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 40976EAF437D05B5 apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 3B4FE6ACC0B21F32 apt-get update - apt-get install -y make gcc-4.8 + apt-get install -y make gcc-4.8 libzstd-dev update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 100 make CC=gcc REDIS_CFLAGS='-Werror' diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index fd067686c1b..5b0e102b0d1 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -11,7 +11,7 @@ on: inputs: skipjobs: description: 'jobs to skip (delete the ones you wanna keep, do not leave empty)' - default: 'valgrind,sanitizer,tls,freebsd,macos,alpine,32bit,iothreads,ubuntu,centos,malloc,specific,fortify,reply-schema,oldTC,defrag,vectorset,assert-keyspace,arm' + default: 'valgrind,sanitizer,tls,freebsd,macos,alpine,32bit,iothreads,ubuntu,centos,malloc,specific,fortify,reply-schema,oldTC,defrag,vectorset,assert-keyspace,arm,compression' skiptests: description: 'tests to skip (delete the ones you wanna keep, do not leave empty)' default: 'redis,modules,sentinel,cluster,unittest' @@ -52,7 +52,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror -DREDIS_TEST -DDEBUG_ASSERTIONS' + run: | + sudo apt-get install -y libzstd-dev + make REDIS_CFLAGS='-Werror -DREDIS_TEST -DDEBUG_ASSERTIONS' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -240,7 +242,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - apt-get update && apt-get install -y make gcc + apt-get update && apt-get install -y make gcc libzstd-dev # Also enables jemalloc's sized deallocation checks to catch sdallocx()/zfree_with_size() misuse. make CC=gcc REDIS_CFLAGS='-Werror -DREDIS_TEST -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' JEMALLOC_CONFIGURE_OPTS='--enable-opt-size-checks' - name: testprep @@ -279,7 +281,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make MALLOC=libc REDIS_CFLAGS='-Werror' + run: | + sudo apt-get install -y libzstd-dev + make MALLOC=libc REDIS_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -313,7 +317,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make MALLOC=libc CFLAGS=-DNO_MALLOC_USABLE_SIZE REDIS_CFLAGS='-Werror' + run: | + sudo apt-get install -y libzstd-dev + make MALLOC=libc CFLAGS=-DNO_MALLOC_USABLE_SIZE REDIS_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -348,7 +354,8 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - sudo apt-get update && sudo apt-get install libc6-dev-i386 gcc-multilib + sudo dpkg --add-architecture i386 + sudo apt-get update && sudo apt-get install libc6-dev-i386 gcc-multilib libzstd-dev:i386 make 32bit REDIS_CFLAGS='-Werror -DREDIS_TEST' make -C tests/modules 32bit # the script below doesn't have an argument, we must build manually ahead of time - name: testprep @@ -388,6 +395,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | + sudo apt-get install -y libzstd-dev make BUILD_TLS=yes REDIS_CFLAGS='-Werror' - name: testprep run: | @@ -428,6 +436,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | + sudo apt-get install -y libzstd-dev make BUILD_TLS=yes REDIS_CFLAGS='-Werror' - name: testprep run: | @@ -468,6 +477,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | + sudo apt-get install -y libzstd-dev make REDIS_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx @@ -478,6 +488,43 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster --config io-threads 4 ${{github.event.inputs.cluster_test_args}} + test-ubuntu-client-compression: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || (github.event_name != 'workflow_dispatch' && github.repository == 'redis/redis')) && + !contains(github.event.inputs.skipjobs, 'compression') + timeout-minutes: 14400 + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@v4 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: | + sudo apt-get install -y libzstd-dev + make REDIS_CFLAGS='-Werror' + - name: testprep + run: sudo apt-get install tcl8.6 tclx + - name: test + if: true && !contains(github.event.inputs.skiptests, 'redis') + # Running with less than default(16) clients as compression slows down + # replication and some tests start to fail because of timing errors. + # This is similar to why we run tsan with only 1 client which is even + # more extreme case + run: ./runtest --clients 4 --config io-threads 4 --config repl-compression 1 --compression --accurate --verbose --tags "repl network iothreads psync2" --dump-logs ${{github.event.inputs.test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: ./runtest-cluster --config io-threads 4 --config repl-compression 1 --compression ${{github.event.inputs.cluster_test_args}} + test-ubuntu-reclaim-cache: runs-on: ubuntu-latest if: | @@ -500,6 +547,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | + sudo apt-get install -y libzstd-dev make REDIS_CFLAGS='-Werror' - name: testprep run: | @@ -577,7 +625,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind REDIS_CFLAGS='-Werror -DREDIS_TEST' + run: | + sudo apt-get update && sudo apt-get install -y libzstd-dev + make valgrind REDIS_CFLAGS='-Werror -DREDIS_TEST' - name: testprep run: | sudo apt-get update @@ -610,7 +660,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind REDIS_CFLAGS='-Werror -DREDIS_TEST' + run: | + sudo apt-get update && sudo apt-get install -y libzstd-dev + make valgrind REDIS_CFLAGS='-Werror -DREDIS_TEST' - name: testprep run: | sudo apt-get update @@ -642,7 +694,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DREDIS_TEST" REDIS_CFLAGS='-Werror' + run: | + sudo apt-get update && sudo apt-get install -y libzstd-dev + make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DREDIS_TEST" REDIS_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -672,7 +726,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DREDIS_TEST" REDIS_CFLAGS='-Werror' + run: | + sudo apt-get update && sudo apt-get install -y libzstd-dev + make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DREDIS_TEST" REDIS_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -709,7 +765,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SANITIZER=address REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' + run: | + sudo apt-get install -y libzstd-dev + make SANITIZER=address REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' - name: testprep run: | sudo apt-get update @@ -750,7 +808,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SANITIZER=memory REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' + run: | + sudo apt-get install -y libzstd-dev + make SANITIZER=memory REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' - name: testprep run: | sudo apt-get update @@ -794,7 +854,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SANITIZER=undefined REDIS_CFLAGS='-DREDIS_TEST -Werror' SKIP_VEC_SETS=yes LUA_DEBUG=yes # we (ab)use this flow to also check Lua C API violations + run: | + sudo apt-get install -y libzstd-dev + make SANITIZER=undefined REDIS_CFLAGS='-DREDIS_TEST -Werror' SKIP_VEC_SETS=yes LUA_DEBUG=yes # we (ab)use this flow to also check Lua C API violations - name: testprep run: | sudo apt-get update @@ -841,7 +903,9 @@ jobs: - name: make # TODO Investigate why jemalloc with clang TSan crash on start; # with gcc TSan, jemalloc works modulo sentinel tests hanging. - run: make SANITIZER=thread USE_JEMALLOC=no REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' + run: | + sudo apt-get install -y libzstd-dev + make SANITIZER=thread USE_JEMALLOC=no REDIS_CFLAGS='-DREDIS_TEST -Werror -DDEBUG_ASSERTIONS' - name: testprep run: | sudo apt-get update @@ -879,7 +943,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - dnf -y install which gcc make + dnf -y install which gcc make libzstd-devel make REDIS_CFLAGS='-Werror' - name: testprep run: | @@ -918,7 +982,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - dnf -y install which gcc make openssl-devel openssl + dnf -y install which gcc make openssl-devel openssl libzstd-devel make BUILD_TLS=module REDIS_CFLAGS='-Werror' - name: testprep run: | @@ -961,7 +1025,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - dnf -y install which gcc make openssl-devel openssl + dnf -y install which gcc make openssl-devel openssl libzstd-devel make BUILD_TLS=module REDIS_CFLAGS='-Werror' - name: testprep run: | @@ -1002,7 +1066,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror' + run: | + brew install zstd + make REDIS_CFLAGS='-Werror' - name: test if: true && !contains(github.event.inputs.skiptests, 'redis') run: ./runtest --accurate --verbose --clients 1 --no-latency --dump-logs ${{github.event.inputs.test_args}} @@ -1028,7 +1094,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror' + run: | + brew install zstd + make REDIS_CFLAGS='-Werror' - name: sentinel tests if: true && !contains(github.event.inputs.skiptests, 'sentinel') run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} @@ -1054,7 +1122,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror' + run: | + brew install zstd + make REDIS_CFLAGS='-Werror' - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} @@ -1086,7 +1156,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror -DREDIS_TEST' + run: | + brew install zstd + make REDIS_CFLAGS='-Werror -DREDIS_TEST' test-freebsd: runs-on: ubuntu-latest @@ -1112,7 +1184,7 @@ jobs: version: 13.2 shell: bash run: | - sudo pkg install -y bash gmake lang/tcl86 lang/tclX gcc + sudo pkg install -y bash gmake lang/tcl86 lang/tclX gcc zstd devel/pkgconf gmake ./runtest --single unit/keyspace --single unit/auth --single unit/networking --single unit/protocol @@ -1138,7 +1210,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - apk add build-base + apk add build-base zstd-dev make REDIS_CFLAGS='-Werror' - name: testprep run: apk add tcl procps tclx @@ -1174,7 +1246,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - apk add build-base + apk add build-base zstd-dev make REDIS_CFLAGS='-Werror' USE_JEMALLOC=no CFLAGS=-DUSE_MALLOC_USABLE_SIZE - name: testprep run: apk add tcl procps tclx @@ -1209,7 +1281,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror -DLOG_REQ_RES' + run: | + sudo apt-get install -y libzstd-dev + make REDIS_CFLAGS='-Werror -DLOG_REQ_RES' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -1262,7 +1336,7 @@ jobs: apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 40976EAF437D05B5 apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 3B4FE6ACC0B21F32 apt-get update - apt-get install -y make gcc-4.8 + apt-get install -y make gcc-4.8 libzstd-dev update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 100 make CC=gcc REDIS_CFLAGS='-Werror' - name: testprep @@ -1307,7 +1381,7 @@ jobs: apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 40976EAF437D05B5 apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 3B4FE6ACC0B21F32 apt-get update - apt-get install -y make gcc-4.8 openssl libssl-dev + apt-get install -y make gcc-4.8 openssl libssl-dev libzstd-dev update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 100 make CC=gcc BUILD_TLS=module REDIS_CFLAGS='-Werror' - name: testprep @@ -1357,7 +1431,7 @@ jobs: apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 40976EAF437D05B5 apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 3B4FE6ACC0B21F32 apt-get update - apt-get install -y make gcc-4.8 openssl libssl-dev + apt-get install -y make gcc-4.8 openssl libssl-dev libzstd-dev update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 100 make BUILD_TLS=module CC=gcc REDIS_CFLAGS='-Werror' - name: testprep @@ -1398,7 +1472,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SANITIZER=address DEBUG_DEFRAG=force REDIS_CFLAGS='-Werror' + run: | + sudo apt-get install -y libzstd-dev + make SANITIZER=address DEBUG_DEFRAG=force REDIS_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -1426,7 +1502,9 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make REDIS_CFLAGS='-Werror -DREDIS_TEST' + run: | + sudo apt-get install -y libzstd-dev + make REDIS_CFLAGS='-Werror -DREDIS_TEST' - name: testprep run: | sudo apt-get install tcl8.6 tclx diff --git a/redis.conf b/redis.conf index 9151c8fc8e5..7077ac2a112 100644 --- a/redis.conf +++ b/redis.conf @@ -839,6 +839,16 @@ replica-priority 100 # By default min-replicas-to-write is set to 0 (feature disabled) and # min-replicas-max-lag is set to 10. +# Enable client compression for replication at specified level when connecting to +# master 0-22 (0 for disabled compression). Currently only zstd compression lib +# is supported hence the compression values 1-22 are related to it. +# NOTE: This setting only works when io-threads > 1 +repl-compression 0 + +# When compression is enabled specify maximum latency in ms before the +# compressed data is flushed and send to the socket. +compression-max-latency 100 + # A Redis master is able to list the address and port of the attached # replicas in different ways. For example the "INFO replication" section # offers this information, which is used, among other tools, by diff --git a/src/Makefile b/src/Makefile index 8939f89e07c..4c04d6f95e1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -152,6 +152,18 @@ endif FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(REDIS_CFLAGS) FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(REDIS_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm + +# Detect libzstd via pkg-config, fall back to -lzstd +LIBZSTD_PKGCONFIG := $(shell $(PKG_CONFIG) --exists libzstd && echo $$?) +ifeq ($(LIBZSTD_PKGCONFIG),0) + LIBZSTD_CFLAGS=$(shell $(PKG_CONFIG) --cflags libzstd) + LIBZSTD_LIBS=$(shell $(PKG_CONFIG) --libs libzstd) +else + LIBZSTD_CFLAGS= + LIBZSTD_LIBS=-lzstd +endif +FINAL_CFLAGS+=$(LIBZSTD_CFLAGS) +FINAL_LIBS+=$(LIBZSTD_LIBS) DEBUG=-g -ggdb # Linux ARM32 needs -latomic at linking time @@ -384,7 +396,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_array.o sparsearray.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o gcra.o vector.o fast_float_strtod.o +REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o t_array.o sparsearray.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o gcra.o vector.o fast_float_strtod.o client_comp.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/ae.h b/src/ae.h index 996d48b0f83..4d6de8049a9 100644 --- a/src/ae.h +++ b/src/ae.h @@ -89,7 +89,7 @@ typedef struct aeEventLoop { aeBeforeSleepProc *beforesleep; aeBeforeSleepProc *aftersleep; int flags; - void *privdata[2]; + void *privdata[3]; } aeEventLoop; /* Prototypes */ diff --git a/src/client_comp.c b/src/client_comp.c new file mode 100644 index 00000000000..6cf8ae0d5fb --- /dev/null +++ b/src/client_comp.c @@ -0,0 +1,919 @@ +#include "server.h" +#include "client_comp.h" +#include + +/* Abstraction over compression library */ +typedef struct compressionType { + int (*init_compress)(struct compressionState *st, int level); + int (*init_decompress)(struct compressionState *st); + int (*compress)(struct compressionState *st, int flush); + int (*decompress)(struct compressionState *st); + void (*end)(struct compressionState *st); +} compressionType; + +/* Temporary buffer used by compression library to store compressed/decompressed + * data. */ +typedef struct { + unsigned char *data; + int size; + int written; + int consumed; +} tempBuf; + +/* Main compression state struct */ +struct compressionState { + const compressionType *type; + tempBuf input; /* Buffer holding compressed data */ + tempBuf output; /* Buffer holding uncompressed(decompressed) data */ + union { + ZSTD_CStream *zstdCCtx; /* Zstd compression ctx */ + ZSTD_DStream *zstdDCtx; /* Zstd decompression ctx */ + } ctx; + int write_flush_pending; /* write flush not yet completed */ + int read_flush_pending; /* read flush not yet completed */ + mstime_t last_write; /* Time since last write. Used to check if it's time + * to flush the buffer */ + compressionDirection dir; +}; + +/* --- zstd --- */ + +static int zstdInitCompress(compressionState *st, int level) { + st->ctx.zstdCCtx = ZSTD_createCStream(); + if (!st->ctx.zstdCCtx) { + serverLog(LL_NOTICE, "Failed to create ZSTD compression context"); + return -1; + } + size_t res = ZSTD_CCtx_setParameter(st->ctx.zstdCCtx, ZSTD_c_compressionLevel, level); + if (ZSTD_isError(res)) { + ZSTD_freeCStream(st->ctx.zstdCCtx); + serverLog(LL_NOTICE, "Failed to set compression level for ZSTD compression context"); + return -1; + } + + /* temp buf storing compressed data */ + size_t outSize = ZSTD_CStreamOutSize(); + st->output.data = zmalloc(outSize); + st->output.size = outSize; + st->output.written = 0; + st->output.consumed = 0; + + /* temp buf storing uncompressed data */ + size_t inSize = ZSTD_CStreamInSize(); + st->input.data = zmalloc(inSize); + st->input.size = inSize; + st->input.written = 0; + st->input.consumed = 0; + + st->write_flush_pending = 0; + + return 0; +} + +static int zstdInitDecompress(compressionState *st) { + st->ctx.zstdDCtx = ZSTD_createDStream(); + if (!st->ctx.zstdDCtx) { + serverLog(LL_NOTICE, "Failed to create ZSTD decompression context"); + return -1; + } + + /* temp buf storing compressed data */ + size_t inSize = ZSTD_DStreamInSize(); + st->input.data = zmalloc(inSize); + st->input.size = inSize; + st->input.written = 0; + st->input.consumed = 0; + + /* temp buf storing decompressed data */ + size_t outSize = ZSTD_DStreamOutSize(); + st->output.data = zmalloc(outSize); + st->output.size = outSize; + st->output.written = 0; + st->output.consumed = 0; + + st->read_flush_pending = 0; + + return 0; +} + +static int zstdCompress(compressionState *st, int flush) { + ZSTD_inBuffer input = { + .src = st->input.data, + .size = st->input.written, + .pos = st->input.consumed + }; + ZSTD_outBuffer output = { + .dst = st->output.data, + .size = st->output.size, + .pos = st->output.written + }; + + ZSTD_EndDirective directive; + /* We use ZSTD_e_end instead of ZSTD_e_flush when we want to flush zstd's. + * This flushes zstd's internal buffers but also ends the current frame. + * This of course lowers the compression ratio but massively increases speed + * on the decompression side also, as it doesn't need to wait for more data. + * The resulting compression ratio is still very good (tested with default + * compression level). */ + if (flush || st->write_flush_pending) + directive = ZSTD_e_end; + else + directive = ZSTD_e_continue; + + size_t ret; + do { + ret = ZSTD_compressStream2(st->ctx.zstdCCtx, &output, &input, directive); + if (ZSTD_isError(ret)) { + serverLog(LL_WARNING, "zstd compress error: %s", ZSTD_getErrorName(ret)); + return -1; + } + } while (ret > 0 && output.pos < output.size); + + /* If we pass a directive different than ZSTD_e_continue to zstd we want to + * keep using that directive until compressStream2 returns 0. By keeping + * this flag raised we know we are in the process of flushing data, i.e we + * cannot use ZSTD_e_continue before we have flushed it all. */ + st->write_flush_pending = (directive == ZSTD_e_end && ret > 0); + + st->input.consumed = input.pos; + st->output.written = output.pos; + + return 0; +} + +static int zstdDecompress(compressionState *st) { + ZSTD_inBuffer input = { + .src = st->input.data, + .size = st->input.written, + .pos = st->input.consumed + }; + ZSTD_outBuffer output = { + .dst = st->output.data, + .size = st->output.size, + .pos = st->output.written + }; + + size_t ret = ZSTD_decompressStream(st->ctx.zstdDCtx, &output, &input); + if (ZSTD_isError(ret)) { + serverLog(LL_NOTICE, "zstd decompress error: %s", ZSTD_getErrorName(ret)); + return -1; + } + + /* Don't try to flush again if we already tried, no more progress can be + * made without additional input. */ + st->read_flush_pending = !st->read_flush_pending && (ret > 0); + + st->input.consumed = input.pos; + st->output.written = output.pos; + + return 0; +} + +static void zstdEnd(compressionState *st) { + if (st->dir == COMPRESS && st->ctx.zstdCCtx) { + /* Flush any pending data so context is in a good state before closing it */ + if (st->write_flush_pending) { + size_t sz = ZSTD_CStreamOutSize(); + char *tmp = zmalloc(sz); + ZSTD_inBuffer input = { + .src = NULL, + .size = 0, + .pos = 0 + }; + ZSTD_outBuffer output = { + .dst = tmp, + .size = sz, + .pos = 0 + }; + while (ZSTD_compressStream2(st->ctx.zstdCCtx, &output, &input, ZSTD_e_end) > 0) { + /* Just ignore the output, we are closing the compression state + * anyways */ + output.pos = 0; + } + } + ZSTD_freeCStream(st->ctx.zstdCCtx); + st->ctx.zstdCCtx = NULL; + } else if (st->dir == DECOMPRESS && st->ctx.zstdDCtx) { + ZSTD_freeDStream(st->ctx.zstdDCtx); + st->ctx.zstdDCtx = NULL; + } +} + +static const compressionType zstdType = { + .init_compress = zstdInitCompress, + .init_decompress = zstdInitDecompress, + .compress = zstdCompress, + .decompress = zstdDecompress, + .end = zstdEnd, +}; + +/* Compression connection. It wraps over existing connection in order to add + * compression capability during read/write operations. The `base` member is only + * used for getting the ConnectionType but whenever we do any operation on the + * connection we use the underlying pointer which is the actual connection. + * See CT_Compression. */ +typedef struct compressionConnection { + connection base; + connection *underlying; + listNode *pending_data_node; + size_t last_read; + size_t last_written; + int handle_pending; +} compressionConnection; + +int decompressInto(compressionState *state, char *buf, size_t buflen); + +static void compressionPendingAdd(compressionConnection *cc) { + if (cc->pending_data_node) return; + + if (!cc->underlying->el->privdata[2]) { + cc->underlying->el->privdata[2] = listCreate(); + } + + list *l = cc->underlying->el->privdata[2]; + listAddNodeTail(l, cc); + cc->pending_data_node = listLast(l); +} + +static void compressionPendingRemove(compressionConnection *cc) { + if (!cc->pending_data_node) return; + + list *l = cc->underlying->el->privdata[2]; + if (l && listLength(l) > 0 && listSearchKey(l, cc) == cc->pending_data_node) { + listDelNode(l, cc->pending_data_node); + } else if (cc->pending_data_node) { + zfree(cc->pending_data_node); + } + cc->pending_data_node = NULL; +} + +static const char *connCompressionGetType(connection *conn) { + UNUSED(conn); + return CONN_TYPE_COMPRESSION; +} + +static void connCompressionAeHandler(aeEventLoop *el, int fd, void *clientData, int mask) { + compressionConnection *cc = (compressionConnection *)clientData; + cc->underlying->type->ae_handler(el, fd, clientData, mask); +} + +static int connCompressionAddr(connection *conn, char *ip, size_t ip_len, int *port, int remote) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->addr(cc->underlying, ip, ip_len, port, remote); +} + +static int connCompressionIsLocal(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->is_local(cc->underlying); +} + +static void connCompressionShutdown(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + cc->underlying->type->shutdown(cc->underlying); +} + +static void connCompressionClose(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + cc->underlying->type->close(cc->underlying); + zfree(conn); +} + +static int connCompressionConnect(connection *conn, const char *addr, int port, const char *source_addr, ConnectionCallbackFunc connect_handler) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->connect(cc->underlying, addr, port, source_addr, connect_handler); +} + +static int connCompressionBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->blocking_connect(cc->underlying, addr, port, timeout); +} + +static int connCompressionAccept(connection *conn, ConnectionCallbackFunc accept_handler) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->accept(cc->underlying, accept_handler); +} + +/* If decompression is initialized for a connection it reads compressed data + * from the underlying connection (i.e usually from socket) into a temp buffer + * and decompresses into the passed `buf`. Tries to read and decompress as much + * as possible. + * There are some scenarios though that we may still have pending data to process + * (see clientHasPendingCompressedData). In such cases the connection is added + * to the event-loop's pending data and processed via connProcessPendingData. */ +static int connCompressionRead(struct connection *conn, void *buf, size_t buf_len) { + compressionConnection *cc = (compressionConnection*)conn; + client *c = connGetPrivateData(conn); + compressionState *state = c->compression_state; + if (!state || state->dir != DECOMPRESS) + return connRead(cc->underlying, buf, buf_len); + + cc->last_read = 0; + size_t decompressed = 0; + do { + int curr = decompressInto(state, (char*)buf + decompressed, buf_len - decompressed); + /* Decompression error, we should close the connection */ + if (curr < 0) { + cc->underlying->state = CONN_STATE_CLOSED; + break; + } + decompressed += curr; + + int nread = 0; + /* If the handle_pending flag is raised we only decompress whatever data + * we have read from the socket without reading anything more. Socket + * reading will happened when the event loop handles read event in which + * case the handle_pending flags wouldn't be raised. */ + if (!cc->handle_pending) { + nread = cc->underlying->type->read( + cc->underlying, + state->input.data + state->input.written, + state->input.size - state->input.written); + + if (nread < 0 && connGetState(cc->underlying) == CONN_STATE_ERROR) { + cc->last_read = -1; + return -1; + } + /* Even if nread == 0 we continue the loop until decompressInto has + * nothing more it can do. */ + if (nread > 0) { + cc->last_read += nread; + state->input.written += nread; + } + } + + if (curr <= 0 && nread <= 0) break; + } while (decompressed <= buf_len); + + if (decompressed == 0 && connGetState(cc->underlying) == CONN_STATE_CONNECTED) + return -1; + + /* No need to give pending work to main thread as the client must be send to + * IO-thread soon enough. */ + if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { + if (connGetState(conn) == CONN_STATE_CONNECTED && clientHasPendingCompressedData(c)) { + compressionPendingAdd(cc); + } else if (cc->pending_data_node) { + compressionPendingRemove(cc); + } + } + + return decompressed; +} + +/* Compress all bytes from `data` and pass the compressed data to the underlying's + * connection write method. */ +static int connCompressionWrite(connection *conn, const void *data, size_t len) { + compressionConnection *cc = (compressionConnection*)conn; + client *c = connGetPrivateData(conn); + compressionState *state = c->compression_state; + /* If no compression state or we are not compressing data - defer to + * underlying connection. */ + if (!state || state->dir != COMPRESS) + return connWrite(cc->underlying, data, len); + + int consumed = 0; + cc->last_written = 0; + while ((size_t)consumed != len) { + int to_consume = + min(state->input.size - state->input.written, (int)(len - consumed)); + serverAssert(to_consume >= 0); + + memcpy(state->input.data + state->input.written, + (char*)data + consumed, to_consume); + + state->input.written += to_consume; + consumed += to_consume; + + /* Write whatever we have available in the compressed buffer */ + int written = 0; + int err = compressAndWrite(c, &written); + cc->last_written += written; + if (err) { + if (connGetState(c->conn) != CONN_STATE_CONNECTED) { + return -1; + } + return consumed; + } + + if (written == 0 && state->output.written == state->output.consumed) + break; + } + + return consumed; +} + +static int connCompressionWritev(connection *conn, const struct iovec *iov, int iovcnt) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->writev(cc->underlying, iov, iovcnt); +} + +static int connCompressionSetWriteHandler(connection *conn, ConnectionCallbackFunc handler, int barrier) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->set_write_handler(cc->underlying, handler, barrier); +} + +static int connCompressionSetReadHandler(connection *conn, ConnectionCallbackFunc handler) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->set_read_handler(cc->underlying, handler); +} + +static const char *connCompressionGetLastError(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->get_last_error(cc->underlying); +} + +static ssize_t connCompressionSyncWrite(connection *conn, char *ptr, ssize_t size, long long timeout) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->sync_write(cc->underlying, ptr, size, timeout); +} + +static ssize_t connCompressionSyncRead(connection *conn, char *ptr, ssize_t size, long long timeout) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->sync_read(cc->underlying, ptr, size, timeout); +} + +static ssize_t connCompressionSyncReadLine(connection *conn, char *ptr, ssize_t size, long long timeout) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->sync_readline(cc->underlying, ptr, size, timeout); +} + +static size_t connCompressionGetLastRead(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->last_read; +} + +static size_t connCompressionGetLastWritten(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->last_written; +} + +static void connCompressionUnbindEventLoop(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + aeEventLoop *el = cc->underlying->el; + if (el) { + int fd = cc->underlying->fd; + int mask = aeGetFileEvents(el, cc->underlying->fd); + if (mask & AE_READABLE) aeDeleteFileEvent(el, fd, AE_READABLE); + if (mask & AE_WRITABLE) aeDeleteFileEvent(el, fd, AE_WRITABLE); + + if (cc->pending_data_node) { + compressionPendingRemove(cc); + } + } +} + +static int connCompressionRebindEventLoop(connection *conn, aeEventLoop *el) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->type->rebind_event_loop(cc->underlying, el); +} + +static aeEventLoop *connCompressionGetEventLoop(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->el; +} + +static void connCompressionUnsetEventLoop(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + cc->underlying->el = NULL; +} + +static int connCompressionGetFd(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->fd; +} + +static int connCompressionGetIovcnt(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->iovcnt; +} + +static ConnectionState connCompressionGetState(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->state; +} + +static int connCompressionGetLastErrno(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->last_errno; +} + +static int connCompressionHasReadHandler(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->read_handler != NULL; +} + +static int connCompressionHasWriteHandler(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->write_handler != NULL; +} + +static void connCompressionSetPrivateData(connection *conn, void *data) { + compressionConnection *cc = (compressionConnection*)conn; + cc->underlying->private_data = data; +} + +static void *connCompressionGetPrivateData(connection *conn) { + compressionConnection *cc = (compressionConnection*)conn; + return cc->underlying->private_data; +} + +static int compressionHasPendingData(struct aeEventLoop *el) { + list *pending_list = el->privdata[2]; + if (!pending_list) + return 0; + return listLength(pending_list) > 0; +} + +/* Handling pending data involves calling the read handler of the underlying + * connection. We don't actually want to call connRead on it as we handle only + * pending data(i.e internal buffers of the compression library not yet flushed + * or non-consumed decompressed data in our temp buffers) and not any actual + * read events. */ +static int compressionProcessPendingData(struct aeEventLoop *el) { + list *pending_list = el->privdata[2]; + if (!pending_list || listLength(pending_list) == 0) + return 0; + + listIter li; + listNode *ln; + int processed = 0; + listRewind(pending_list,&li); + while((ln = listNext(&li))) { + compressionConnection *cc = listNodeValue(ln); + if (!cc || !cc->underlying || !connHasReadHandler((connection*)cc)) continue; + + cc->handle_pending = 1; + cc->underlying->read_handler(cc->underlying); + cc->handle_pending = 0; + + ++processed; + } + return processed; +} + +/* ConnectionType for the compression connection */ +static ConnectionType CT_Compression = { + /* connection type */ + .get_type = connCompressionGetType, + + /* ae & accept & listen & error & address handler */ + .ae_handler = connCompressionAeHandler, + .addr = connCompressionAddr, + .is_local = connCompressionIsLocal, + + /* shutdown/close connection */ + .shutdown = connCompressionShutdown, + .close = connCompressionClose, + + /* connect & accept */ + .connect = connCompressionConnect, + .blocking_connect = connCompressionBlockingConnect, + .accept = connCompressionAccept, + + /* event loop */ + .unbind_event_loop = connCompressionUnbindEventLoop, + .rebind_event_loop = connCompressionRebindEventLoop, + + /* IO */ + .read = connCompressionRead, + .write = connCompressionWrite, + .writev = connCompressionWritev, + .set_write_handler = connCompressionSetWriteHandler, + .set_read_handler = connCompressionSetReadHandler, + .get_last_error = connCompressionGetLastError, + .sync_write = connCompressionSyncWrite, + .sync_read = connCompressionSyncRead, + .sync_readline = connCompressionSyncReadLine, + .get_last_read = connCompressionGetLastRead, + .get_last_written = connCompressionGetLastWritten, + + /* pending data */ + .has_pending_data = compressionHasPendingData, + .process_pending_data = compressionProcessPendingData, + + .get_peer_cert = NULL, + .get_peer_username = NULL, + + /* connection accessors */ + .get_event_loop = connCompressionGetEventLoop, + .unset_event_loop = connCompressionUnsetEventLoop, + .get_fd = connCompressionGetFd, + .get_iovcnt = connCompressionGetIovcnt, + .get_state = connCompressionGetState, + .get_last_errno = connCompressionGetLastErrno, + .has_read_handler = connCompressionHasReadHandler, + .has_write_handler = connCompressionHasWriteHandler, + .set_private_data = connCompressionSetPrivateData, + .get_private_data = connCompressionGetPrivateData, +}; + +int RedisRegisterConnectionTypeCompression(void) { + return connTypeRegister(&CT_Compression); +} + +/* Create compression state for the client */ +int compressionStateCreate(client *c) { + compressionState *st = zmalloc(sizeof(compressionState)); + st->type = &zstdType; + st->last_write = 0; + st->write_flush_pending = 0; + st->read_flush_pending = 0; + st->dir = CD_INVALID; + + c->compression_state = st; + + return 1; +} + +void compressionStateDestroy(compressionState *state) { + if (state == NULL) return; + + state->type->end(state); + zfree(state->input.data); + zfree(state->output.data); + zfree(state); +} + +/* Create and initialize a compression state for the client. No-op if already + * initialized. `dir` indicates the compression direction, i.e if the client + * will compress or decompress data. + * Currently only viable for master/replica clients. */ +int clientCreateCompressionState(client *c, compressionDirection dir) { + /* Client compression already initialized */ + if (c->compression_state != NULL) + return 1; + + serverAssert(compressionStateCreate(c)); + + compressionState *st = c->compression_state; + + if (dir == COMPRESS) { + serverAssert(c->compression_level > 0 && c->flags & CLIENT_SLAVE); + + if (st->type->init_compress(st, c->compression_level) == -1) { + compressionStateDestroy(c->compression_state); + return 0; + } + + st->dir = COMPRESS; + + serverLog(LL_NOTICE, "Initialized compression at level %d for client #%llu...", + c->compression_level, (unsigned long long)c->id); + } else if (dir == DECOMPRESS) { + serverAssert(server.repl_master_compression_level > 0); + + if (st->type->init_decompress(st) == -1) { + compressionStateDestroy(c->compression_state); + return 0; + } + + st->dir = DECOMPRESS; + + serverLog(LL_NOTICE, "Decompression for master client initialized."); + } else { + /* Inaccessible */ + serverAssert(0); + } + + compressionConnection *cc = zmalloc(sizeof(compressionConnection)); + cc->base.type = &CT_Compression; + cc->underlying = c->conn; + cc->pending_data_node = NULL; + cc->last_read = 0; + cc->last_written = 0; + cc->handle_pending = 0; + c->conn = (connection*)cc; + + return 1; +} + +void clientDestroyCompressionState(client *c) { + if (c->compression_state == NULL) return; + + /* If the connection is not destroyed yet we need to switch back to the + * underlying connection. */ + if (c->conn) { + compressionConnection *cc = (compressionConnection*)c->conn; + c->conn = cc->underlying; + + zfree(cc); + } + + compressionStateDestroy(c->compression_state); + c->compression_state = NULL; + c->compression_level = 0; + c->io_flags &= ~CLIENT_IO_COMPRESSION_ENABLED; + + serverLog(LL_NOTICE, "Compression state for client #%llu%s destroyed...", + (unsigned long long)c->id, + c->flags & CLIENT_MASTER ? " (master)" : c->flags & CLIENT_SLAVE ? + " (slave)" : ""); +} + +/* Enable client compression and create compression state if not present. + * Currently only valid for primary/replica clients. + * Return 0 if compression state was not created and failed to be initialized, + * 1 if compression was enabled. */ +int clientEnableCompression(client *c, compressionDirection dir) { + serverAssert((c->flags & CLIENT_MASTER) || (c->flags & CLIENT_SLAVE)); + if (!clientCreateCompressionState(c, dir)) { + return 0; + } + + c->io_flags |= CLIENT_IO_COMPRESSION_ENABLED; + return 1; +} + +/* Disable compression without destroying compression state. */ +void clientDisableCompression(client *c) { + c->io_flags &= ~CLIENT_IO_COMPRESSION_ENABLED; +} + +/* Compress any data send for compression (see connCompressionWrite) and + * write to socket. Compression library may not return compressed data + * immediately so this call may not write anything to socket. + * Force flushes the compressed buffer according to compression_max_latency. + * Return number of bytes written to socket or -1 on socket write error. */ +int compressAndWrite(client *c, int *tot_written) { + if (c->compression_level <= 0) + return 0; + + compressionState *state = c->compression_state; + serverAssert(state); + + /* All available uncompressed data was consumed so we need to reset the + * uncompressed buffer */ + if (state->input.written == state->input.size && + state->input.consumed == state->input.size) + { + state->input.written = 0; + state->input.consumed = 0; + } + + if (state->output.written < state->output.size) { + /* Force flush after `compression_max_latency` ms have passed. + * Note, this only makes sense when we have enough space for compressing + * data. */ + int flush = mstime() - state->last_write > server.compression_max_latency; + if (state->type->compress(state, flush) == -1) { + clientDestroyCompressionState(c); + return 1; + } + } + + compressionConnection *cc = (compressionConnection*)c->conn; + + /* Try to write all the data available in the compressed buffer. */ + *tot_written = 0; + int towrite = + state->output.written - state->output.consumed; + do { + int written = connWrite( + cc->underlying, state->output.data + state->output.consumed, towrite); + if (written < 0) { + return 1; + } + + state->output.consumed += written; + *tot_written += written; + towrite -= written; + + /* All of the compressed data was send to the socket so we need to reset + * the compression buffer. */ + if (state->output.consumed == state->output.size) { + serverAssert(towrite == 0 && state->output.written == state->output.size); + + state->output.written = 0; + state->output.consumed = 0; + } + } while (towrite > 0); + + if (*tot_written > 0) + state->last_write = mstime(); + + return 0; +} + +/* Decompress input compressed data and put it in `buf`. If decompressed data + * is more than buflen this function must be called again so output data can + * be consumed. If buflen is sufficiently large this function will decompress + * as much data as possible. */ +int decompressInto(compressionState *state, char *buf, size_t buflen) { + if (buflen == 0) + return 0; + + int consumed = 0; + + /* Decompress as much data as possible */ + while ((size_t)consumed < buflen && + (state->read_flush_pending || + state->input.written > state->input.consumed || + state->output.written > state->output.consumed)) + { + /* Reset the decompressed buffer if all the available data is consumed */ + if (state->output.consumed == state->output.size) { + state->output.written = 0; + state->output.consumed = 0; + } + + if ((state->read_flush_pending && state->output.size > state->output.written) || + state->input.written > state->input.consumed) + { + if (state->type->decompress(state) == -1) { + return -1; + } + } + + /* Copy the decompressed data to the output buffer */ + if (state->output.written > state->output.consumed) { + size_t nonconsumed_decompressed = state->output.written - state->output.consumed; + int to_consume = min(buflen - consumed, nonconsumed_decompressed); + memcpy(buf + consumed, state->output.data + state->output.consumed, to_consume); + + state->output.consumed += to_consume; + consumed += to_consume; + } + } + + if (state->output.consumed == state->output.size) + { + state->output.written = 0; + state->output.consumed = 0; + } + + /* Reset the compressed buffer if we decompressed all the available data */ + if (state->input.consumed == state->input.size) { + state->input.written = 0; + state->input.consumed = 0; + } + + serverAssert((size_t)consumed <= buflen); + return consumed; +} + +/* Read data from input_buf and decompress it immediately. The result is written + * into output_buf. + * Return number of bytes decompressed. *consumed stores number of bytes consumed + * from input_buf. + * Note, that we may have enough compressed data inside input buf so that decompressing + * it will exceed output_len. The function must be ran in a loop until input_buf + * is fully consumed - so make sure to have free space in output_buf on each call. */ +int readFromBufAndDecompress(client *c, char *input_buf, size_t input_len, + char *output_buf, size_t output_len, size_t *consumed) +{ + compressionState *state = c->compression_state; + if (!state) { + return -1; + } + + int tot_decompressed = 0; + *consumed = 0; + while (*consumed <= input_len && (size_t)tot_decompressed < output_len) { + int to_consume = + min(state->input.size - state->input.written, + (int)(input_len - *consumed)); + + if (to_consume) + memcpy(state->input.data + state->input.written, + input_buf + *consumed, to_consume); + + *consumed += to_consume; + + state->input.written += to_consume; + + int decompressed = decompressInto(state, output_buf + tot_decompressed, + output_len - tot_decompressed); + if (decompressed <= 0) + break; + tot_decompressed += decompressed; + } + + return tot_decompressed; +} + +/* Check if we need to flush compressed data. Compression library may wait for + * a lot of compressed data before it finishes a frame and gives it back to user. + * While this gives the best compression ratio it introduces a lot of latency so + * we make a compromise and flush periodically. */ +int clientHasPendingCompressionFlush(client *c) { + compressionState *state = c->compression_state; + if (!state) return 0; + if (state->dir != COMPRESS) return 0; + + return state->write_flush_pending || (mstime() - state->last_write >= server.compression_max_latency); +} + +/* Check if we still have pending compressed data. This may mean that either the + * compression library still has data in it's internal buffers, we still have + * compressed data that needs to be consumed by the library or we have stored + * decompressed data that we still have not consumed. */ +int clientHasPendingCompressedData(client *c) { + compressionState *state = c->compression_state; + if (!state) return 0; + if (state->dir != DECOMPRESS) return 0; + + return (state->read_flush_pending && state->output.size > state->output.written) || + state->input.written > state->input.consumed || + state->output.written > state->output.consumed; +} + diff --git a/src/client_comp.h b/src/client_comp.h new file mode 100644 index 00000000000..a5a308fa937 --- /dev/null +++ b/src/client_comp.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2009-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +#ifndef __CLIENT_COMP_H +#define __CLIENT_COMP_H + +#include + +/* Opaque handle to client's compression state used internally by client_comp */ +typedef struct compressionState compressionState; + +struct client; + +typedef enum { + CD_INVALID, + COMPRESS, + DECOMPRESS, +} compressionDirection; + +int clientCreateCompressionState(struct client *c, compressionDirection dir); +void clientDestroyCompressionState(struct client *c); + +int clientEnableCompression(struct client *c, compressionDirection dir); + +void clientDisableCompression(struct client *c); + +int compressAndWrite(struct client *c, int *tot_written); + +int readFromBufAndDecompress(struct client *c, char *input_buf, size_t input_len, + char *output_buf, size_t output_len, + size_t *consumed); + +int clientHasPendingCompressionFlush(struct client *c); +int clientHasPendingCompressedData(struct client *c); + +#endif /* __CLIENT_COMP_H */ diff --git a/src/cluster_asm.c b/src/cluster_asm.c index ac94a5103c9..63d662026fe 100644 --- a/src/cluster_asm.c +++ b/src/cluster_asm.c @@ -1373,7 +1373,7 @@ void asmRdbChannelSyncWithSource(connection *conn) { * surface the error condition. * using shutdown() instead of connShutdown() because connTLSShutdown() * will free the connection directly, which is not what we want. */ - shutdown(conn->fd, SHUT_RDWR); + shutdown(connGetFd(conn), SHUT_RDWR); connRead(conn, buf, 1); } @@ -1524,7 +1524,7 @@ void asmSyncWithSource(connection *conn) { /* Check if the fail point is active for this channel and state */ if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) { char buf[1]; - shutdown(conn->fd, SHUT_RDWR); + shutdown(connGetFd(conn), SHUT_RDWR); connRead(conn, buf, 1); } @@ -1693,7 +1693,7 @@ void asmSlotSnapshotAndStreamStart(struct asmTask *task) { if (task == NULL || task->state != ASM_WAIT_BGSAVE_START) return; if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_RDB_CHANNEL, task->state))) { - shutdown(task->rdb_channel_client->conn->fd, SHUT_RDWR); + shutdown(connGetFd(task->rdb_channel_client->conn), SHUT_RDWR); return; } task->main_channel_client->replstate = SLAVE_STATE_SEND_BULK_AND_STREAM; @@ -2449,7 +2449,7 @@ static void asmSyncBufferReadFromConn(connection *conn) { /* ASM_ACCUMULATE_BUF and ASM_STREAMING_BUF fail points are handled here */ if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) - shutdown(conn->fd, SHUT_RDWR); + shutdown(connGetFd(conn), SHUT_RDWR); replDataBuf *buf = &task->sync_buffer; if (task->state == ASM_STREAMING_BUF) { @@ -2551,7 +2551,7 @@ void asmSyncBufferStreamToDb(asmTask *task) { task->dest_offset); if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) - shutdown(task->main_channel_conn->fd, SHUT_RDWR); /* Simulate a failure */ + shutdown(connGetFd(task->main_channel_conn), SHUT_RDWR); /* Simulate a failure */ /* ACK offset after streaming buffer is done. */ asmImportSendACK(task); @@ -2577,7 +2577,7 @@ void asmSendStreamEofIfDrained(asmTask *task) { serverLog(LL_NOTICE, "Slot migration command stream drained, sending STREAM-EOF to the destination"); if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_MAIN_CHANNEL, task->state))) - shutdown(c->conn->fd, SHUT_RDWR); + shutdown(connGetFd(c->conn), SHUT_RDWR); /* Send STREAM-EOF to indicate the end of the stream. */ char *err = sendCommand(c->conn, "CLUSTER", "SYNCSLOTS", "STREAM-EOF", NULL); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index c93aea2ad6b..abf052d7a37 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1221,7 +1221,7 @@ void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { * one of the links. The existing link is more likely the outdated one, but it's * possible the other node may need to open another link. */ serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", - node->inbound_link->conn->fd, node->name, link->conn->fd); + connGetFd(node->inbound_link->conn), node->name, connGetFd(link->conn)); freeClusterLink(node->inbound_link); } serverAssert(!node->inbound_link); diff --git a/src/config.c b/src/config.c index 97fa58a11d5..01145e23928 100644 --- a/src/config.c +++ b/src/config.c @@ -3304,6 +3304,8 @@ standardConfig static_configs[] = { createIntConfig("cluster-slot-migration-max-archived-tasks", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1, INT_MAX, server.asm_max_archived_tasks, 32, INTEGER_CONFIG, NULL, NULL), createIntConfig("lookahead", NULL, MODIFIABLE_CONFIG, 1, INT_MAX, server.lookahead, REDIS_DEFAULT_LOOKAHEAD, INTEGER_CONFIG, NULL, NULL), createIntConfig("slowlog-entry-max-argc", NULL, MODIFIABLE_CONFIG, 2, INT_MAX, server.slowlog_max_argc, 32, INTEGER_CONFIG, NULL, NULL), + createIntConfig("repl-compression", NULL, MODIFIABLE_CONFIG, 0, 22, server.repl_compression, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("compression-max-latency", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.compression_max_latency, 100, INTEGER_CONFIG, NULL, NULL), /* 100ms */ /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), diff --git a/src/connection.c b/src/connection.c index 38c7564fa5b..191edfae15f 100644 --- a/src/connection.c +++ b/src/connection.c @@ -68,6 +68,8 @@ int connTypeInitialize(void) { /* may fail if without BUILD_TLS=yes */ RedisRegisterConnectionTypeTLS(); + RedisRegisterConnectionTypeCompression(); + return C_OK; } diff --git a/src/connection.h b/src/connection.h index 3d3691edb4e..9310c2362f7 100644 --- a/src/connection.h +++ b/src/connection.h @@ -40,6 +40,7 @@ typedef enum { #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" #define CONN_TYPE_TLS "tls" +#define CONN_TYPE_COMPRESSION "compression" #define CONN_TYPE_MAX 8 /* 8 is enough to be extendable */ typedef void (*ConnectionCallbackFunc)(struct connection *conn); @@ -81,6 +82,8 @@ typedef struct ConnectionType { ssize_t (*sync_write)(struct connection *conn, char *ptr, ssize_t size, long long timeout); ssize_t (*sync_read)(struct connection *conn, char *ptr, ssize_t size, long long timeout); ssize_t (*sync_readline)(struct connection *conn, char *ptr, ssize_t size, long long timeout); + size_t (*get_last_read)(struct connection *conn); + size_t (*get_last_written)(struct connection *conn); /* event loop */ void (*unbind_event_loop)(struct connection *conn); @@ -95,6 +98,17 @@ typedef struct ConnectionType { /* Get peer username based on connection type */ sds (*get_peer_username)(connection *conn); + + aeEventLoop* (*get_event_loop)(connection *conn); + void (*unset_event_loop)(connection *conn); + int (*get_fd)(connection *conn); + int (*get_iovcnt)(connection *conn); + ConnectionState (*get_state)(connection *conn); + int (*get_last_errno)(connection *conn); + int (*has_read_handler)(connection *conn); + int (*has_write_handler)(connection *conn); + void (*set_private_data)(connection *conn, void *data); + void* (*get_private_data)(connection *conn); } ConnectionType; struct connection { @@ -257,13 +271,33 @@ static inline ssize_t connSyncReadLine(connection *conn, char *ptr, ssize_t size return conn->type->sync_readline(conn, ptr, size, timeout); } +/* If connection type has a special way to get last read bytes from connection + * save the value in last_read and return 1. Otherwise return 0. + * F.e when compression is enabled we read compressed data from socket inside + * connRead but it returns the number of bytes decompressed. In order to get the + * number of bytes read from socket we call connCheckLastRead. */ +static inline int connCheckLastRead(connection *conn, size_t *last_read) { + if (!conn->type->get_last_read) return 0; + *last_read = conn->type->get_last_read(conn); + return 1; +} + +/* If connection type has a special way to get last written bytes to connection + * save the value in last_written and return 1. Otherwise return 0. */ +static inline int connCheckLastWritten(connection *conn, size_t *last_written) { + if (!conn->type->get_last_written) return 0; + *last_written = conn->type->get_last_written(conn); + return 1; +} + /* Return CONN_TYPE_* for the specified connection */ static inline const char *connGetType(connection *conn) { return conn->type->get_type(conn); } static inline int connLastErrorRetryable(connection *conn) { - return conn->last_errno == EINTR; + int last_errno = conn->type->get_last_errno ? conn->type->get_last_errno(conn) : conn->last_errno; + return last_errno == EINTR; } /* Get address information of a connection. @@ -314,34 +348,52 @@ static inline int connIsLocal(connection *conn) { return -1; } -static inline int connGetState(connection *conn) { - return conn->state; +static inline ConnectionState connGetState(connection *conn) { + return conn->type->get_state ? conn->type->get_state(conn) : conn->state; +} + +static inline int connGetFd(connection *conn) { + return conn->type->get_fd ? conn->type->get_fd(conn) : conn->fd; +} + +static inline int connGetIovcnt(connection *conn) { + return conn->type->get_iovcnt ? conn->type->get_iovcnt(conn) : conn->iovcnt; } /* Returns true if a write handler is registered */ static inline int connHasWriteHandler(connection *conn) { - return conn->write_handler != NULL; + return conn->type->has_write_handler ? + conn->type->has_write_handler(conn) : + (conn->write_handler != NULL); } /* Returns true if a read handler is registered */ static inline int connHasReadHandler(connection *conn) { - return conn->read_handler != NULL; + return conn->type->has_read_handler ? + conn->type->has_read_handler(conn) : + (conn->read_handler != NULL); } /* Returns true if the connection is bound to an event loop */ static inline int connHasEventLoop(connection *conn) { - return conn->el != NULL; + return conn->type->get_event_loop ? + (conn->type->get_event_loop(conn) != NULL) : + (conn->el != NULL); } /* Unbind the current event loop from the connection, so that it can be * rebind to a different event loop in the future. */ static inline void connUnbindEventLoop(connection *conn) { - if (conn->el == NULL) return; + if (!connHasEventLoop(conn)) return; connSetReadHandler(conn, NULL); connSetWriteHandler(conn, NULL); if (conn->type->unbind_event_loop) conn->type->unbind_event_loop(conn); - conn->el = NULL; + + if (conn->type->unset_event_loop) + conn->type->unset_event_loop(conn); + else + conn->el = NULL; } /* Rebind the connection to another event loop, read/write handlers must not @@ -352,12 +404,17 @@ static inline int connRebindEventLoop(connection *conn, aeEventLoop *el) { /* Associate a private data pointer with the connection */ static inline void connSetPrivateData(connection *conn, void *data) { - conn->private_data = data; + if (conn->type->set_private_data) + conn->type->set_private_data(conn, data); + else + conn->private_data = data; } /* Get the associated private data pointer */ static inline void *connGetPrivateData(connection *conn) { - return conn->private_data; + return conn->type->get_private_data ? + conn->type->get_private_data(conn) : + conn->private_data; } /* Return a text that describes the connection, suitable for inclusion @@ -366,7 +423,8 @@ static inline void *connGetPrivateData(connection *conn) { * For sockets, we always return "fd=" to maintain compatibility. */ static inline const char *connGetInfo(connection *conn, char *buf, size_t buf_len) { - snprintf(buf, buf_len-1, "fd=%i", conn == NULL ? -1 : conn->fd); + int fd = conn ? connGetFd(conn) : -1; + snprintf(buf, buf_len-1, "fd=%i", conn == NULL ? -1 : fd); return buf; } @@ -462,6 +520,7 @@ sds getListensInfoString(sds info); int RedisRegisterConnectionTypeSocket(void); int RedisRegisterConnectionTypeUnix(void); int RedisRegisterConnectionTypeTLS(void); +int RedisRegisterConnectionTypeCompression(void); /* Return 1 if connection is using TLS protocol, 0 if otherwise. */ static inline int connIsTLS(connection *conn) { diff --git a/src/iothread.c b/src/iothread.c index 3ee3e674b12..594dd598ee2 100644 --- a/src/iothread.c +++ b/src/iothread.c @@ -10,6 +10,12 @@ #include "server.h" +#define IO_DEFAULT_HZ CONFIG_DEFAULT_HZ + +/* Replicates the behaviour of run_with_period used in serverCron but for + * IO threads. IO threads use default Hz for now. */ +#define run_with_period_io(_t_, _ms_) _run_with_period((_t_)->cronloops, (_ms_), IO_DEFAULT_HZ) + /* IO threads. */ IOThread IOThreads[IO_THREADS_MAX_NUM]; @@ -124,6 +130,13 @@ void enqueuePendingClientsToMainThread(client *c, int unbind) { listUnlinkNode(t->clients, c->io_thread_client_list_node); listLinkNodeTail(t->pending_clients_to_main_thread, c->io_thread_client_list_node); c->io_thread_client_list_node = NULL; + + if (listSearchKey(t->compression_clients, c) == + &c->io_thread_compression_clients_node) + { + listUnlinkNode(t->compression_clients, + &c->io_thread_compression_clients_node); + } } } @@ -149,6 +162,17 @@ void enqueuePendingClienstToIOThreads(client *c) { c->io_lastinteraction = c->lastinteraction; } + /* Check here prevents data races with IO thread which may also check + * this flag. */ + if (!(c->io_flags & CLIENT_IO_COMPRESSION_ENABLED)) { + if (c->compression_level > 0) { + clientEnableCompression(c, COMPRESS); + } + if (c->flags & CLIENT_MASTER && server.repl_master_compression_level > 0) { + clientEnableCompression(c, DECOMPRESS); + } + } + c->running_tid = c->tid; listAddNodeHead(mainThreadPendingClientsToIOThreads[c->tid], c); } @@ -158,10 +182,24 @@ void enqueuePendingClienstToIOThreads(client *c) { void unbindClientFromIOThreadEventLoop(client *c) { serverAssert(c->tid != IOTHREAD_MAIN_THREAD_ID && c->running_tid == IOTHREAD_MAIN_THREAD_ID); - if (!connHasEventLoop(c->conn)) return; + /* If the client is not bound to an event loop there is nothing to do, + * unless the client uses repl compression in which case we need to unlink + * it from IO Thread compression_clients list. */ + if (!connHasEventLoop(c->conn) && !c->compression_state) return; + /* As calling in main thread, we should pause the io thread to make it safe. */ pauseIOThread(c->tid); connUnbindEventLoop(c->conn); + IOThread *t = &IOThreads[c->tid]; + /* We need to remove the client from the compression_clients list so it + * won't be processed in IOThreadCompressionCron anymore */ + if (listSearchKey(t->compression_clients, c) == + &c->io_thread_compression_clients_node) + { + listUnlinkNode(t->compression_clients, + &c->io_thread_compression_clients_node); + clientDisableCompression(c); + } resumeIOThread(c->tid); } @@ -304,7 +342,7 @@ void assignClientToIOThread(client *c) { * write, and then put it in the list, main thread will send these clients * to IO thread in beforeSleep. */ connUnbindEventLoop(c->conn); - c->io_flags &= ~(CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED); + c->io_flags &= ~(CLIENT_IO_READ_ENABLED | CLIENT_IO_WRITE_ENABLED | CLIENT_IO_COMPRESSION_ENABLED); enqueuePendingClienstToIOThreads(c); } @@ -765,6 +803,14 @@ int processClientsFromMainThread(IOThread *t) { connSetReadHandler(c->conn, readQueryFromClient); } + /* Add the client to the compression clients list. */ + if (c->compression_state != NULL && + listSearchKey(t->compression_clients, c) == NULL) + { + listLinkNodeTail(t->compression_clients, + &c->io_thread_compression_clients_node); + } + /* If the client has pending replies, write replies to client. */ if (clientHasPendingReplies(c)) { writeToClient(c, 0); @@ -825,7 +871,7 @@ void IOThreadClientsCron(IOThread *t) { /* Process at least a few clients while we are at it, even if we need * to process less than CLIENTS_CRON_MIN_ITERATIONS to meet our contract * of processing each client once per second. */ - int iterations = listLength(t->clients) / CONFIG_DEFAULT_HZ; + int iterations = listLength(t->clients) / IO_DEFAULT_HZ; if (iterations < CLIENTS_CRON_MIN_ITERATIONS) { iterations = CLIENTS_CRON_MIN_ITERATIONS; } @@ -841,7 +887,48 @@ void IOThreadClientsCron(IOThread *t) { } } -/* This is the IO thread timer interrupt, CONFIG_DEFAULT_HZ times per second. +void IOThreadCompressionCron(IOThread *t) { + if (listLength(t->compression_clients) == 0) return; + + listIter li; + listNode *ln; + listRewind(t->compression_clients, &li); + /* TODO: if compression is generalized for all types of clients this cron + * will need to only process a portion of the clients (similar to + * IOThreadClientsCron) for performance reasons. I.e in such case the + * clientHasPendingCompressionFlush check may be moved directly to + * IOThreadClientsCron. */ + while ((ln = listNext(&li))) { + client *c = listNodeValue(ln); + + if (c->io_flags & CLIENT_IO_CLOSE_ASAP) continue; + serverAssert(c->compression_state); + + /* Usually compressAndWrite will be called at the end of + * consumeAndTryWriteCompressed but when compression maximum latency ms + * have passed we want to force flush to the compression buffer so we + * don't have much delays between writes to the socket */ + if (clientHasPendingCompressionFlush(c)) { + /* Only master/replica clients support client compression for now. */ + serverAssert(c->flags & CLIENT_SLAVE); + + int written = 0; + int err = compressAndWrite(c, &written); + if (err) { + if (connGetState(c->conn) != CONN_STATE_CONNECTED) + freeClientAsync(c); + continue; + } + + if (written > 0) { + c->net_output_bytes += written; + atomicIncr(server.stat_net_repl_output_bytes, written); + } + } + } +} + +/* This is the IO thread timer interrupt, IO_DEFAULT_HZ times per second. * The current responsibility is to detect clients that have been stuck in the * IO thread for too long and hand them over to the main thread for handling. */ int IOThreadCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { @@ -849,10 +936,12 @@ int IOThreadCron(struct aeEventLoop *eventLoop, long long id, void *clientData) UNUSED(id); IOThread *t = clientData; + run_with_period_io(t, server.compression_max_latency) IOThreadCompressionCron(t); + /* Run cron tasks for the clients in the IO thread. */ IOThreadClientsCron(t); - return 1000/CONFIG_DEFAULT_HZ; + return 1000/IO_DEFAULT_HZ; } /* The main function of IO thread, it will run an event loop. The mian thread @@ -894,6 +983,8 @@ void initThreadedIO(void) { t->processing_clients = listCreate(); t->pending_clients_to_main_thread = listCreate(); t->clients = listCreate(); + t->compression_clients = listCreate(); + t->cronloops = 0; atomicSetWithSync(t->paused, IO_THREAD_UNPAUSED); atomicSetWithSync(t->running, 0); diff --git a/src/networking.c b/src/networking.c index 2f5384c3b99..31072f1707f 100644 --- a/src/networking.c +++ b/src/networking.c @@ -227,6 +227,7 @@ client *createClient(connection *conn) { c->sockname = NULL; c->client_list_node = NULL; c->io_thread_client_list_node = NULL; + listInitNode(&c->io_thread_compression_clients_node, c); c->postponed_list_node = NULL; c->client_tracking_redirection = 0; c->client_tracking_prefixes = NULL; @@ -256,6 +257,8 @@ client *createClient(connection *conn) { c->stat_avg_pipeline_length_cnt = 0; c->task = NULL; c->node_id = NULL; + c->compression_level = 0; + c->compression_state = NULL; atomicSet(c->pending_read, 0); return c; } @@ -1911,7 +1914,7 @@ void unlinkClient(client *c) { * the main process is stale. SSL_shutdown() involves a handshake, * and it may block the caller when used with stale TLS state.*/ if (c->flags & CLIENT_REPL_RDB_CHANNEL) - shutdown(c->conn->fd, SHUT_RDWR); + shutdown(connGetFd(c->conn), SHUT_RDWR); else connShutdown(c->conn); } @@ -2199,6 +2202,9 @@ void freeClient(client *c) { listDelNode(server.clients_to_close,ln); } + /* Disable compression if present */ + clientDestroyCompressionState(c); + /* If it is our master that's being disconnected we should make sure * to cache the state to try a partial resynchronization later. * @@ -2216,7 +2222,8 @@ void freeClient(client *c) { /* Log link disconnection with slave */ if (clientTypeIsSlave(c)) { - const char *type = c->flags & CLIENT_REPL_RDB_CHANNEL ? " (rdbchannel)" : ""; + int is_rdb_ch = c->flags & CLIENT_REPL_RDB_CHANNEL; + const char *type = is_rdb_ch ? " (rdbchannel)" : ""; serverLog(LL_NOTICE,"Connection with replica%s %s lost.", type, replicationGetSlaveName(c)); } @@ -2558,7 +2565,7 @@ static payloadHeader *processSentDataInEncodedBuffer(client *c, char *start_ptr, * and 'nwritten' is an output parameter, it means how many bytes server write * to client. */ static int _writevToClient(client *c, ssize_t *nwritten) { - int iovmax = min(IOV_MAX, c->conn->iovcnt); + int iovmax = min(IOV_MAX, connGetIovcnt(c->conn)); struct iovec iov[iovmax]; ReplyIOV reply_iov = {iov, iovmax}; @@ -2717,6 +2724,45 @@ static inline int _writeToClientNonSlave(client *c, ssize_t *nwritten) { return C_OK; } +static inline int _writeToClientSlaveIOThread(client *c, ssize_t *nwritten) { + replBufBlock *o = listNodeValue(c->io_curr_repl_node); + /* The IO thread must not send data beyond the bound position. */ + size_t pos = c->io_curr_repl_node == c->io_bound_repl_node ? + c->io_bound_block_pos : o->used; + if (pos > c->io_curr_block_pos) { + int consumed = connWrite(c->conn, o->buf+c->io_curr_block_pos, + pos-c->io_curr_block_pos); + + /* Note, that consumed is how much bytes we've read from the repl buffer, + * where as the bytes we've written into the socket may be different if + * connCheckLastWritten returns so (f.e compression case) */ + /* TODO: if compression is generalized for all types of clients we will + * need to add this check in writeToClientNonSlave also */ + size_t last_written = 0; + if (connCheckLastWritten(c->conn, &last_written)) { + *nwritten += last_written; + /* Since nwritten stores the number of compressed bytes written to + * socket we also store the uncompressed size for stats. */ + atomicIncr(server.stat_net_repl_uncompressed_bytes, consumed); + } else { + *nwritten += consumed; + } + if (consumed <= 0) return C_ERR; + + /* Advance the block position with consumed, because that's how much + * bytes were read from the repl buffer node. *nwritten stores how + * much bytes were written to the socket, which for the compression + * case would most certainly be a different number. */ + c->io_curr_block_pos += consumed; + } + /* If we fully sent the object and there are more nodes to send, go to the next one. */ + if (c->io_curr_block_pos == pos && c->io_curr_repl_node != c->io_bound_repl_node) { + c->io_curr_repl_node = listNextNode(c->io_curr_repl_node); + c->io_curr_block_pos = 0; + } + return C_OK; +} + /* This function does actual writing output buffers for slave client types, * it is called by writeToClient. * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, @@ -2727,22 +2773,7 @@ static inline int _writeToClientSlave(client *c, ssize_t *nwritten) { serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { - replBufBlock *o = listNodeValue(c->io_curr_repl_node); - /* The IO thread must not send data beyond the bound position. */ - size_t pos = c->io_curr_repl_node == c->io_bound_repl_node ? - c->io_bound_block_pos : o->used; - if (pos > c->io_curr_block_pos) { - *nwritten = connWrite(c->conn, o->buf+c->io_curr_block_pos, - pos-c->io_curr_block_pos); - if (*nwritten <= 0) return C_ERR; - c->io_curr_block_pos += *nwritten; - } - /* If we fully sent the object and there are more nodes to send, go to the next one. */ - if (c->io_curr_block_pos == pos && c->io_curr_repl_node != c->io_bound_repl_node) { - c->io_curr_repl_node = listNextNode(c->io_curr_repl_node); - c->io_curr_block_pos = 0; - } - return C_OK; + return _writeToClientSlaveIOThread(c, nwritten); } replBufBlock *o = listNodeValue(c->ref_repl_buf_node); @@ -2754,6 +2785,7 @@ static inline int _writeToClientSlave(client *c, ssize_t *nwritten) { if (*nwritten <= 0) return C_ERR; c->ref_block_pos += *nwritten; } + /* If we fully sent the object on head, go to the next one. */ listNode *next = listNextNode(c->ref_repl_buf_node); if (next && c->ref_block_pos == o->used) { @@ -3821,7 +3853,13 @@ void readQueryFromClient(connection *conn) { int nread, big_arg = 0; size_t qblen, readlen; - if (!(c->io_flags & CLIENT_IO_READ_ENABLED)) { + /* We have to read compressed data but compression for this client is + * currently disabled. This could happened f.e when client was just fetched + * to main thread. */ + int pending_compression_read = c->compression_state && + !(c->io_flags & CLIENT_IO_COMPRESSION_ENABLED); + + if (!(c->io_flags & CLIENT_IO_READ_ENABLED) || pending_compression_read) { atomicSetWithSync(c->pending_read, 1); return; } else if (server.io_threads_num > 1) { @@ -3928,6 +3966,16 @@ void readQueryFromClient(connection *conn) { * c->lastinteraction will be updated during processClientsFromIOThread */ c->io_lastinteraction = server.unixtime; + size_t network_read; + if (!connCheckLastRead(c->conn, &network_read)) { + network_read = nread; + } else { + /* In case of compression nread is the number of decompressed bytes, + * whereas network_read stores the actual number of bytes read from + * socket. */ + atomicIncr(server.stat_net_repl_decompressed_bytes, nread); + } + if (c->flags & CLIENT_MASTER) { if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) { c->read_reploff += nread; @@ -3935,11 +3983,11 @@ void readQueryFromClient(connection *conn) { /* Same comment as for c->io_lastinteraction */ c->io_read_reploff += nread; } - atomicIncr(server.stat_net_repl_input_bytes, nread); + atomicIncr(server.stat_net_repl_input_bytes, network_read); } else { - atomicIncr(server.stat_net_input_bytes, nread); + atomicIncr(server.stat_net_input_bytes, network_read); } - c->net_input_bytes += nread; + c->net_input_bytes += network_read; if (!(c->flags & CLIENT_MASTER) && /* The commands cached in the MULTI/EXEC queue have not been executed yet, diff --git a/src/replication.c b/src/replication.c index aaedabd123f..1a79c48406b 100644 --- a/src/replication.c +++ b/src/replication.c @@ -146,7 +146,8 @@ void putReplicasInPendingClientsToIOThreads(void) { * also send them to the IO thread. */ if (replica->flags & CLIENT_PENDING_WRITE || clientHasPendingReplies(replica) || - replicaFromIOThreadHasPendingRead(replica)) + replicaFromIOThreadHasPendingRead(replica) || + clientHasPendingCompressionFlush(replica)) { enqueuePendingClienstToIOThreads(replica); } @@ -1482,6 +1483,27 @@ void replconfCommand(client *c) { server.repl_diskless_sync) { c->slave_capa |= SLAVE_CAPA_RDB_CHANNEL_REPL; } + } else if (!strcasecmp(c->argv[j]->ptr, "compression")) { + long level; + + if ((getLongFromObjectOrReply(c,c->argv[j+1], &level,NULL) != C_OK)) + return; + + if (server.io_threads_num > 1) { + serverLog(LL_NOTICE, "Client #%llu request for replication " + "compression with level %ld accepted", + (unsigned long long)c->id, level); + + c->compression_level = level; + } else { + serverLog(LL_NOTICE, "Client #%llu request for replication " + "compression rejected. Replication compression is only " + "enabled with IO threads.", + (unsigned long long)c->id); + c->compression_level = 0; + addReplyError(c, "Master has not enabled replication compression"); + return; + } } else if (!strcasecmp(c->argv[j]->ptr,"ack")) { /* REPLCONF ACK is used by slave to inform the master the amount * of replication stream that it processed so far. It is an @@ -1518,6 +1540,7 @@ void replconfCommand(client *c) { * that 'ack' might be received before we detect bgsave is done. */ if (c->replstate == SLAVE_STATE_SEND_BULK_AND_STREAM) replicaPutOnline(c); + /* Note: this command does not reply anything! */ return; } else if (!strcasecmp(c->argv[j]->ptr,"getack")) { @@ -2700,6 +2723,12 @@ void readSyncBulkPayload(connection *conn) { if (server.repl_backlog == NULL) createReplicationBacklog(); serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Finished with success"); + /* If we agreed on compression with the master then setup compression on the + * master client. At this point we're done reading all non compressed payload + * from the master */ + if (server.repl_master_compression_level) + clientEnableCompression(server.master, DECOMPRESS); + if (server.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Finished with success. Ready to accept connections in read-write mode.\n"); } @@ -3182,6 +3211,14 @@ void syncWithMaster(connection *conn) { if (err) goto write_error; } + /* Try to setup client compression if we're configured so */ + if (server.repl_compression > 0) { + sds level = sdsfromlonglong(server.repl_compression); + err = sendCommand(conn, "REPLCONF", "compression", level, NULL); + sdsfree(level); + if (err) goto write_error; + } + /* Inform the master of our (slave) capabilities. * * EOF: supports EOF-style RDB transfer for diskless replication. @@ -3250,7 +3287,7 @@ void syncWithMaster(connection *conn) { } if (server.repl_state == REPL_STATE_RECEIVE_REQ_REPLY && !no_compress_checksum) - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + server.repl_state = REPL_STATE_RECEIVE_CLIENT_COMP; /* Receive REPLCONF REQUEST reply (rdb-no-compress and rdb-no-checksum). */ if (server.repl_state == REPL_STATE_RECEIVE_REQ_REPLY) { @@ -3263,6 +3300,32 @@ void syncWithMaster(connection *conn) { "REPLCONF rdb-no-compress/checksum: %s", err); } sdsfree(err); + server.repl_state = REPL_STATE_RECEIVE_CLIENT_COMP; + return; + } + + if (server.repl_state == REPL_STATE_RECEIVE_CLIENT_COMP && server.repl_compression <= 0) + server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + + if (server.repl_state == REPL_STATE_RECEIVE_CLIENT_COMP) { + err = receiveSynchronousResponse(conn); + if (err == NULL) goto no_response_error; + + /* Ignore the error if any, not all the Redis versions support + * REPLCONF compression. */ + if (err[0] == '-') { + serverLog(LL_NOTICE, + "(Non critical) Replication compression not enabled: %s", + err); + server.repl_compression = 0; + server.repl_master_compression_level = 0; + } else { + serverLog(LL_NOTICE, "Master agreed to compression level %d", + server.repl_compression); + server.repl_master_compression_level = server.repl_compression; + } + sdsfree(err); + err = NULL; server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; return; } @@ -3334,6 +3397,9 @@ void syncWithMaster(connection *conn) { if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization."); + if (server.repl_master_compression_level > 0) { + clientEnableCompression(server.master, DECOMPRESS); + } if (server.supervised_mode == SUPERVISED_SYSTEMD) { redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections in read-write mode.\n"); } @@ -4126,6 +4192,9 @@ int replDataBufStreamToDb(replDataBuf *buf, replDataBufToDbCtx *ctx) { int ret = C_OK; client *c = ctx->client; + if (server.repl_master_compression_level > 0) + serverAssert(server.master->compression_state); + blockingOperationStarts(); while ((n = listFirst(buf->blocks))) { replDataBufBlock *o = listNodeValue(n); @@ -4134,30 +4203,53 @@ int replDataBufStreamToDb(replDataBuf *buf, replDataBufToDbCtx *ctx) { size_t processed = 0; while (processed < o->used) { - size_t bytes = min(PROTO_IOBUF_LEN, o->used - processed); - c->querybuf = sdscatlen(c->querybuf, &o->buf[processed], bytes); - c->read_reploff += (long long int) bytes; + /* Consumed bytes from the current block in the current iteration. */ + size_t consumed = 0; + + if (server.repl_master_compression_level > 0) { + c->querybuf = sdsMakeRoomFor(c->querybuf, PROTO_IOBUF_LEN); + + size_t qblen = sdslen(c->querybuf); + size_t avail = sdsavail(c->querybuf); + int decompressed = readFromBufAndDecompress(c, o->buf + processed, + o->used - processed, + c->querybuf + qblen, + avail, + &consumed); + serverAssert(decompressed >= 0); + sdsIncrLen(c->querybuf, decompressed); + c->read_reploff += (long long) decompressed; + c->io_read_reploff += (long long int) decompressed; + + atomicIncr(server.stat_net_repl_decompressed_bytes, decompressed); + } else { + size_t bytes = min(PROTO_IOBUF_LEN, o->used - processed); + c->querybuf = sdscatlen(c->querybuf, &o->buf[processed], bytes); + c->read_reploff += (long long int) bytes; + c->io_read_reploff += (long long int) bytes; + consumed = bytes; + } c->lastinteraction = server.unixtime; /* We don't expect error return value but just in case. */ ret = processInputBuffer(c); if (ret != C_OK) break; - processed += bytes; - buf->used -= bytes; + processed += consumed; + buf->used -= consumed; if (server.repl_debug_pause & REPL_DEBUG_ON_STREAMING_REPL_BUF) debugPauseProcess(); /* Check if we should yield back to the event loop */ if (server.loading_process_events_interval_bytes && - ((ctx->applied_offset + bytes) / server.loading_process_events_interval_bytes > + ((ctx->applied_offset + consumed) / server.loading_process_events_interval_bytes > ctx->applied_offset / server.loading_process_events_interval_bytes)) { ctx->yield_callback(ctx); processEventsWhileBlocked(); } - ctx->applied_offset += bytes; + ctx->applied_offset += consumed; /* Check if we should continue processing */ if (!ctx->should_continue(ctx)) { diff --git a/src/server.c b/src/server.c index df660175e09..b0a541c30cd 100644 --- a/src/server.c +++ b/src/server.c @@ -2915,6 +2915,8 @@ void resetServerStats(void) { atomicSet(server.stat_net_output_bytes, 0); atomicSet(server.stat_net_repl_input_bytes, 0); atomicSet(server.stat_net_repl_output_bytes, 0); + atomicSet(server.stat_net_repl_uncompressed_bytes, 0); + atomicSet(server.stat_net_repl_decompressed_bytes, 0); server.stat_unexpected_error_replies = 0; server.stat_total_error_replies = 0; server.stat_dump_payload_sanitizations = 0; @@ -6642,6 +6644,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { long long stat_total_client_process_input_buff_events; long long stat_avg_pipeline_length_sum; long long stat_avg_pipeline_length_cnt; + long long stat_net_repl_uncompressed_bytes, stat_net_repl_decompressed_bytes; long long current_eviction_exceeded_time = server.stat_last_eviction_exceeded_time ? (long long) elapsedUs(server.stat_last_eviction_exceeded_time): 0; long long current_active_defrag_time = server.stat_last_active_defrag_time ? @@ -6655,6 +6658,8 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { atomicGet(server.stat_total_client_process_input_buff_events, stat_total_client_process_input_buff_events); atomicGet(server.stat_avg_pipeline_length_sum, stat_avg_pipeline_length_sum); atomicGet(server.stat_avg_pipeline_length_cnt, stat_avg_pipeline_length_cnt); + atomicGet(server.stat_net_repl_uncompressed_bytes, stat_net_repl_uncompressed_bytes); + atomicGet(server.stat_net_repl_decompressed_bytes, stat_net_repl_decompressed_bytes); /* If we calculated the total reads and writes in the threads section, * we don't need to do it again, and also keep the values consistent. */ @@ -6747,6 +6752,12 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { if (!server.cluster_enabled && server.cluster_compatibility_sample_ratio) { info = sdscatprintf(info, "cluster_incompatible_ops:%lld\r\n", server.stat_cluster_incompatible_ops); } + if (stat_net_repl_uncompressed_bytes > 0) { + info = sdscatprintf(info, "total_net_repl_uncompressed_bytes:%lld\r\n", stat_net_repl_uncompressed_bytes); + } + if (stat_net_repl_decompressed_bytes > 0) { + info = sdscatprintf(info, "total_net_repl_decompressed_bytes:%lld\r\n", stat_net_repl_decompressed_bytes); + } } /* Replication */ diff --git a/src/server.h b/src/server.h index 9318eec686d..39a31fb5f21 100644 --- a/src/server.h +++ b/src/server.h @@ -67,6 +67,7 @@ typedef long long ustime_t; /* microsecond time type. */ #include "connection.h" /* Connection abstraction */ #include "eventnotifier.h" /* Event notification */ #include "memory_prefetch.h" +#include "client_comp.h" /* Forward declarations needed by redismodule.h and keymeta.h */ struct redisObject; @@ -460,6 +461,8 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CLIENT_IO_REUSABLE_QUERYBUFFER (1ULL<<3) /* The client is using the reusable query buffer. */ #define CLIENT_IO_CLOSE_ASAP (1ULL<<4) /* Close this client ASAP in IO thread. */ #define CLIENT_IO_PENDING_CRON (1ULL<<5) /* The client is pending cron job, to be processed in main thread. */ +#define CLIENT_IO_COMPRESSION_ENABLED (1ULL<<6) /* The client compression is enabled for this client*/ +#define CLIENT_IO_READ_DECOMPRESSED_CRON (1ULL<<7) /* The client is reading only decompressed data inside its querybuf */ /* Definitions for client read errors. These error codes are used to indicate * various issues that can occur while reading or parsing data from a client. */ @@ -527,6 +530,7 @@ typedef enum { REPL_STATE_RECEIVE_IP_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_REQ_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ + REPL_STATE_RECEIVE_CLIENT_COMP, /* Wait for CLIENT_COMP reply */ REPL_STATE_SEND_PSYNC, /* Send PSYNC */ REPL_STATE_RECEIVE_PSYNC_REPLY, /* Wait for PSYNC reply */ /* --- End of handshake states --- */ @@ -810,10 +814,12 @@ typedef enum { #endif #define NOTIFY_ALL (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | NOTIFY_EVICTED | NOTIFY_STREAM | NOTIFY_MODULE | NOTIFY_ARRAY) /* A flag */ +#define _run_with_period(_cronloops_, _ms_, _hz_) if (((_ms_) <= 1000/(_hz_)) || !((_cronloops_)%((_ms_)/(1000/(_hz_))))) + /* Using the following macro you can run code inside serverCron() with the * specified period, specified in milliseconds. * The actual resolution depends on server.hz. */ -#define run_with_period(_ms_) if (((_ms_) <= 1000/server.hz) || !(server.cronloops%((_ms_)/(1000/server.hz)))) +#define run_with_period(_ms_) _run_with_period(server.cronloops, (_ms_), server.hz) /* We can print the stacktrace, so our assert is defined this way: */ #define serverAssertWithInfo(_c,_o,_e) (likely(_e)?(void)0 : (_serverAssertWithInfo(_c,_o,#_e,__FILE__,__LINE__),redis_unreachable())) @@ -1581,6 +1587,7 @@ typedef struct client { sds sockname; /* Cached connection target address. */ listNode *client_list_node; /* list node in client list */ listNode *io_thread_client_list_node; /* list node in io thread client list */ + listNode io_thread_compression_clients_node; /* list node in io thread compression clients list */ listNode *postponed_list_node; /* list node within the postponed list */ void *module_blocked_client; /* Pointer to the RedisModuleBlockedClient associated with this * client. This is set in case of module authentication before the @@ -1597,7 +1604,10 @@ typedef struct client { void *auth_module; /* The module that owns the callback, which is used * to disconnect the client if the module is * unloaded for cleanup. Opaque for Redis Core.*/ - + compressionState *compression_state; /* Opauqe handle to compression state */ + int compression_level; /* Compression level (0 means no compresison). + * Currently not relevant for non-replication + * connections. */ /* If this client is in tracking mode and this field is non zero, * invalidation messages for keys fetched by this client will be sent to * the specified client ID. */ @@ -1679,6 +1689,8 @@ typedef struct __attribute__((aligned(CACHE_LINE_SIZE))) { list *clients; /* IO thread managed clients. */ redisAtomic long long io_reads_processed; /* Number of read events processed */ redisAtomic long long io_writes_processed; /* Number of write events processed */ + list *compression_clients; /* Clients that write/read compressed data */ + size_t cronloops; } IOThread; extern IOThread IOThreads[IO_THREADS_MAX_NUM]; @@ -2147,6 +2159,8 @@ struct redisServer { redisAtomic long long stat_net_output_bytes; /* Bytes written to network. */ redisAtomic long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ redisAtomic long long stat_net_repl_output_bytes; /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ + redisAtomic long long stat_net_repl_uncompressed_bytes; /* Bytes read from repl buffer before being compressed and written during replication. */ + redisAtomic long long stat_net_repl_decompressed_bytes; /* Decompressed bytes after reading compressed data during replication. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -2409,6 +2423,8 @@ struct redisServer { char master_replid[CONFIG_RUN_ID_SIZE+1]; /* Master PSYNC runid. */ long long master_initial_offset; /* Master PSYNC offset. */ int repl_slave_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + int repl_compression; /* Should slave attempt compressed replication link */ + int repl_master_compression_level; /* Compression level agreed with master */ /* Synchronous replication. */ list *clients_waiting_acks; /* Clients waiting in WAIT or WAITAOF. */ int get_ack_from_slaves; /* If true we send REPLCONF GETACK. */ @@ -2429,6 +2445,7 @@ struct redisServer { int oom_score_adj_values[CONFIG_OOM_COUNT]; /* Linux oom_score_adj configuration */ int oom_score_adj; /* If true, oom_score_adj is managed */ int disable_thp; /* If true, disable THP by syscall */ + int compression_max_latency; /* flush interval for client compression in ms */ /* Blocked clients */ unsigned int blocked_clients; /* # of clients executing a blocking cmd.*/ unsigned int blocked_clients_by_type[BLOCKED_NUM]; diff --git a/src/socket.c b/src/socket.c index 226b414f85e..4920ac3ff80 100644 --- a/src/socket.c +++ b/src/socket.c @@ -423,6 +423,8 @@ static ConnectionType CT_Socket = { .sync_write = connSocketSyncWrite, .sync_read = connSocketSyncRead, .sync_readline = connSocketSyncReadLine, + .get_last_read = NULL, + .get_last_written = NULL, /* pending data */ .has_pending_data = NULL, @@ -430,36 +432,43 @@ static ConnectionType CT_Socket = { }; int connBlock(connection *conn) { - if (conn->fd == -1) return C_ERR; - return anetBlock(NULL, conn->fd); + int fd = connGetFd(conn); + if (fd == -1) return C_ERR; + return anetBlock(NULL, fd); } int connNonBlock(connection *conn) { - if (conn->fd == -1) return C_ERR; - return anetNonBlock(NULL, conn->fd); + int fd = connGetFd(conn); + if (fd == -1) return C_ERR; + return anetNonBlock(NULL, fd); } int connEnableTcpNoDelay(connection *conn) { - if (conn->fd == -1) return C_ERR; - return anetEnableTcpNoDelay(NULL, conn->fd); + int fd = connGetFd(conn); + if (fd == -1) return C_ERR; + return anetEnableTcpNoDelay(NULL, fd); } int connDisableTcpNoDelay(connection *conn) { - if (conn->fd == -1) return C_ERR; - return anetDisableTcpNoDelay(NULL, conn->fd); + int fd = connGetFd(conn); + if (fd == -1) return C_ERR; + return anetDisableTcpNoDelay(NULL, fd); } int connKeepAlive(connection *conn, int interval) { - if (conn->fd == -1) return C_ERR; - return anetKeepAlive(NULL, conn->fd, interval); + int fd = connGetFd(conn); + if (fd == -1) return C_ERR; + return anetKeepAlive(NULL, fd, interval); } int connSendTimeout(connection *conn, long long ms) { - return anetSendTimeout(NULL, conn->fd, ms); + int fd = connGetFd(conn); + return anetSendTimeout(NULL, fd, ms); } int connRecvTimeout(connection *conn, long long ms) { - return anetRecvTimeout(NULL, conn->fd, ms); + int fd = connGetFd(conn); + return anetRecvTimeout(NULL, fd, ms); } int RedisRegisterConnectionTypeSocket(void) diff --git a/src/tls.c b/src/tls.c index 0537dfef377..f506d327e13 100644 --- a/src/tls.c +++ b/src/tls.c @@ -1235,6 +1235,8 @@ static ConnectionType CT_TLS = { .sync_write = connTLSSyncWrite, .sync_read = connTLSSyncRead, .sync_readline = connTLSSyncReadLine, + .get_last_read = NULL, + .get_last_written = NULL, /* pending data */ .has_pending_data = tlsHasPendingData, diff --git a/src/unix.c b/src/unix.c index e26f0d27ad1..6aad6c91ad4 100644 --- a/src/unix.c +++ b/src/unix.c @@ -207,6 +207,8 @@ static ConnectionType CT_Unix = { .sync_write = connUnixSyncWrite, .sync_read = connUnixSyncRead, .sync_readline = connUnixSyncReadLine, + .get_last_read = NULL, + .get_last_written = NULL, /* pending data */ .has_pending_data = NULL, diff --git a/tests/instances.tcl b/tests/instances.tcl index 8f0a23bd298..35c3ef71352 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -41,6 +41,7 @@ set ::run_matching {} ; # If non empty, only tests matching pattern are run. set ::stop_on_failure 0 set ::loop 0 set ::tsan 0 +set ::compression 0 if {[catch {cd tmp}]} { puts "tmp directory not found." @@ -313,6 +314,8 @@ proc parse_options {} { set ::force_resp3 1 } elseif {$opt eq {--tsan}} { set ::tsan 1 + } elseif {$opt eq {--compression}} { + set ::compression 1 } elseif {$opt eq "--help"} { puts "--single Only runs tests specified by pattern." puts "--dont-clean Keep log files on exit." diff --git a/tests/integration/psync2-master-restart.tcl b/tests/integration/psync2-master-restart.tcl index 5971e74e12d..d1640b90a32 100644 --- a/tests/integration/psync2-master-restart.tcl +++ b/tests/integration/psync2-master-restart.tcl @@ -90,7 +90,7 @@ start_server {} { restart_server 0 true false true now set master [srv 0 client] } - wait_for_condition 50 1000 { + wait_for_condition 100 1000 { [status $replica master_link_status] eq {up} && [status $sub_replica master_link_status] eq {up} } else { @@ -131,7 +131,6 @@ start_server {} { $offset == [status $replica master_repl_offset] && $offset == [status $sub_replica master_repl_offset] } else { - show_cluster_status fail "Replicas and master offsets were unable to match *exactly*." } @@ -196,7 +195,7 @@ start_server {} { after 20 - wait_for_condition 500 100 { + wait_for_condition 1000 100 { [status $master master_repl_offset] == [status $replica master_repl_offset] && [status $master master_repl_offset] == [status $sub_replica master_repl_offset] } else { diff --git a/tests/integration/replication-buffer.tcl b/tests/integration/replication-buffer.tcl index 11e604c7562..5c224e400b7 100644 --- a/tests/integration/replication-buffer.tcl +++ b/tests/integration/replication-buffer.tcl @@ -76,7 +76,17 @@ start_server {} { # replication buffer, so all replicas output memory is not # more than double of replication buffer. set repl_buf_mem [s mem_total_replication_buffers] - set extra_mem [expr {[s used_memory]-$before_used-1024*1024}] + set repl_full_sync_buf_mem [s mem_replica_full_sync_buffer] + + set compression_extra_mem 0 + if {$::compression} { + # each replica holds 2 temp user-space buffers of size 128Kb related + # to compression + set numreplicas 3 + set compression_extra_mem [expr {(2 * 128) * 1024 * $numreplicas}] + } + set extra_mem [expr {[s used_memory] - $before_used - 1024*1024 - $compression_extra_mem}] + if {$rdbchannel == "yes"} { # master's replication buffers should not grow assert {$extra_mem < 1024*1024} @@ -162,6 +172,15 @@ start_server {} { # Generating RDB will take 1000 seconds $master config set rdb-key-save-delay 1000000 populate 1000 master 10000 + + # When compression is enabled the repl buffer may be consumed by the + # compression library faster than the data is actually send. We still + # trim the replication buffer in that case. In order for the following + # test to work we rely on replica2 being slow. If repl-rdb-channel is + # enabled though the repl buffer may be consumed faster than we expect. + if {$::compression} { + $replica2 config set repl-rdb-channel no + } $replica2 replicaof $master_host $master_port # Make sure replica2 is waiting bgsave wait_for_condition 5000 100 { @@ -174,6 +193,11 @@ start_server {} { # the slow replica2 kept replication buffer. populate 20000 master 10000 assert {[s repl_backlog_histlen] > [expr 10000*10000]} + + # revert the config + if {$::compression} { + $replica2 config set repl-rdb-channel no + } } # Wait replica1 catch up with the master @@ -224,10 +248,12 @@ start_server {} { # Since we trim replication backlog inrementally, replication backlog # memory may take time to be reclaimed. - wait_for_condition 1000 100 { - [s repl_backlog_histlen] < [expr 10000*10000] - } else { - fail "Replication backlog memory is not smaller" + if {!$::compression} { + wait_for_condition 1000 100 { + [s repl_backlog_histlen] < [expr 10000*10000] + } else { + fail "Replication backlog memory is not smaller" + } } resume_process $replica2_pid } diff --git a/tests/integration/replication-rdbchannel.tcl b/tests/integration/replication-rdbchannel.tcl index 0fc955bd6f0..531a8edd246 100644 --- a/tests/integration/replication-rdbchannel.tcl +++ b/tests/integration/replication-rdbchannel.tcl @@ -299,8 +299,12 @@ start_server {tags {"repl external:skip debug_defrag:skip"}} { assert_lessthan [expr $peak_master_used_mem - $prev_used - $backlog_size] 1000000 assert_lessthan $peak_master_rpl_buf [expr {$backlog_size + 1000000}] assert_lessthan $peak_master_slave_buf_size 1000000 - # buffers in the replica are more than 5mb - assert_morethan $peak_replica_buf_size 5000000 + # buffers in the replica are more than 5mb. Skip when testing + # compression as replica buf contains compressed data in that case + # which is a lot smaller + if {$::compression == 0} { + assert_morethan $peak_replica_buf_size 5000000 + } stop_write_load $load_handle } @@ -343,11 +347,17 @@ start_server {tags {"repl external:skip"}} { fail "replica didn't start sync" } - # Create some traffic on replication stream - populate 100 master 100000 + # Create some traffic on replication stream. + # In case of compression generate a command stream that is not well + # compressed so we can reach the buffer limits easier. + if {$::compression} { + populate 1000 master 500000 0 false 0 true + } else { + populate 100 master 100000 + } # Wait for replica's buffer limit reached - wait_for_log_messages -1 {"*Replication buffer limit has been reached*"} 0 1000 10 + wait_for_log_messages -1 {"*Replication buffer limit has been reached*"} 0 100 10 # Speed up loading $replica config set key-load-delay 0 @@ -361,7 +371,12 @@ start_server {tags {"repl external:skip"}} { } # Verify sync was not interrupted. - assert_equal [s 0 sync_full] [expr $prev_sync_full + 1] + # In the compression case the master's output buffer limits could be + # reached when replica stops accumulating command stream since we + # are sending a lot more data in this case. + if {$::compression == 0} { + assert_equal [s 0 sync_full] [expr $prev_sync_full + 1] + } # Verify db's are identical assert_morethan [$master dbsize] 0 @@ -434,7 +449,11 @@ start_server {tags {"repl external:skip debug_defrag:skip"}} { } # Generate replication traffic of ~20mb to disconnect the slave on obuf limit - populate 20 master 1000000 -1 + if {$::compression} { + populate 200 master 1000000 -1 false 0 true + } else { + populate 20 master 1000000 -1 + } wait_for_log_messages -1 {"*Client * closed * for overcoming of output buffer limits.*"} $loglines 1000 10 $replica config set key-load-delay 0 diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 0611a970eed..697d392544b 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -108,13 +108,13 @@ start_server {tags {"repl external:skip"}} { $B config set loglevel debug r set test foo assert_equal [r getset test bar] foo - wait_for_condition 500 10 { + wait_for_condition 1000 10 { [$A get test] eq "bar" } else { fail "getset wasn't propagated" } assert_equal [r set test vaz get] bar - wait_for_condition 500 10 { + wait_for_condition 1000 10 { [$A get test] eq "vaz" } else { fail "set get wasn't propagated" @@ -241,8 +241,8 @@ start_server {tags {"repl external:skip"}} { s role } {slave} - wait_for_sync r test {Sync should have transferred keys from master} { + wait_for_sync r r get mykey } {foo} @@ -367,7 +367,7 @@ foreach mdl {no yes} rdbchannel {no yes} { # Wait for all the three slaves to reach the "online" # state from the POV of the master. - set retry 500 + set retry 1000 while {$retry} { set info [r -3 info] if {[string match {*slave0:*state=online*slave1:*state=online*slave2:*state=online*} $info]} { @@ -810,7 +810,7 @@ test {diskless loading short read} { for {set i 0} {$i < $attempts} {incr i} { # wait for the replica to start reading the rdb # using the log file since the replica only responds to INFO once in 2mb - set res [wait_for_log_messages -1 {"*Loading DB in memory*"} $loglines 2000 1] + set res [wait_for_log_messages -1 {"*Loading DB in memory*"} $loglines 10000 1] set loglines [lindex $res 1] # add some additional random sleep so that we kill the master on a different place each time @@ -1551,7 +1551,7 @@ start_server {tags {"repl external:skip"}} { foreach disklessload {disabled on-empty-db} { test "Replica should reply LOADING while flushing a large db (disklessload: $disklessload)" { - start_server {} { + start_server {tags {"repl"}} { set replica [srv 0 client] start_server {} { set master [srv 0 client] diff --git a/tests/support/test.tcl b/tests/support/test.tcl index 01ea8e717aa..3d416d451fe 100644 --- a/tests/support/test.tcl +++ b/tests/support/test.tcl @@ -126,6 +126,9 @@ proc assert_refcount_morethan {key ref} { # max retries and delay between retries. Otherwise the 'elsescript' is # executed. proc wait_for_condition {maxtries delay e _else_ elsescript} { + if {$::compression} { + set maxtries [expr $maxtries * 3] + } if {$_else_ ne "else"} { error "$_else_ must be equal to \"else\"" } diff --git a/tests/support/util.tcl b/tests/support/util.tcl index e46da150ad7..ef5a1a75cbd 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -130,7 +130,7 @@ proc waitForBgrewriteaof r { } proc wait_for_sync r { - set maxtries 50 + set maxtries 200 # tsan adds significant overhead to the execution time, so we increase the # wait time here JIC if {$::tsan} { @@ -150,6 +150,9 @@ proc wait_replica_online {r {replica_id 0} {maxtries 50} {delay 100}} { if {$::tsan} { set maxtries [expr {$maxtries * 2}] } + if {$::compression} { + set maxtries [expr {$maxtries * 3}] + } wait_for_condition $maxtries $delay { [string match "*slave$replica_id:*,state=online*" [$r info replication]] @@ -165,6 +168,9 @@ proc wait_for_ofs_sync {r1 r2} { if {$::tsan} { set maxtries 100 } + if {$::compression} { + set maxtries 150 + } wait_for_condition $maxtries 100 { [status $r1 master_repl_offset] eq [status $r2 master_repl_offset] } else { @@ -221,6 +227,9 @@ proc verify_log_message {srv_idx pattern from_line} { # wait for pattern to be found in server's stdout after certain line number # return value is a list containing the line that matched the pattern and the line number proc wait_for_log_messages {srv_idx patterns from_line maxtries delay} { + if {$::compression} { + set maxtries [expr {$maxtries * 3}] + } set retry $maxtries set next_line [expr $from_line + 1] ;# searching form the line after set stdout [srv $srv_idx stdout] @@ -656,11 +665,19 @@ proc stop_bg_complex_data {handle} { # Write num keys with the given key prefix and value size (in bytes). If idx is # given, it's the index (AKA level) used with the srv procedure and it specifies # to which Redis instance to write the keys. -proc populate {num {prefix key:} {size 3} {idx 0} {prints false} {expires 0}} { +proc populate {num {prefix key:} {size 3} {idx 0} {prints false} {expires 0} {random false}} { r $idx deferred 1 if {$num > 16} {set pipeline 16} else {set pipeline $num} - set val [string repeat A $size] + if {$random} { + set baseval [randstring $size $size] + } else { + set val [string repeat A $size] + } for {set j 0} {$j < $pipeline} {incr j} { + if {$random} { + set jstr [format "%08d" $j] + set val [string replace $baseval 0 7 $jstr] + } if {$expires > 0} { r $idx set $prefix$j $val ex $expires } else { @@ -669,6 +686,10 @@ proc populate {num {prefix key:} {size 3} {idx 0} {prints false} {expires 0}} { if {$prints} {puts $j} } for {} {$j < $num} {incr j} { + if {$random} { + set jstr [format "%08d" $j] + set val [string replace $baseval 0 7 $jstr] + } if {$expires > 0} { r $idx set $prefix$j $val ex $expires } else { @@ -754,7 +775,11 @@ proc resume_process {pid} { after 100 } - wait_for_condition 50 1000 { + set max_attempts 50 + if {$::compression} { + set max_attempts 150 + } + wait_for_condition $max_attempts 1000 { [string match "R*" [exec ps -o state= -p $pid]] || [string match "S*" [exec ps -o state= -p $pid]] } else { diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 2f73df1c469..f7fb27b47ef 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -90,6 +90,7 @@ set ::large_memory 0 set ::log_req_res 0 set ::force_resp3 0 set ::debug_defrag 0 +set ::compression 0 # Set to 1 when we are running in client mode. The Redis test uses a # server-client model to run tests simultaneously. The server instance @@ -728,6 +729,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} { set ::ignoredigest 1 } elseif {$opt eq {--debug-defrag}} { set ::debug_defrag 1 + } elseif {$opt eq {--compression}} { + set ::compression 1 } elseif {$opt eq {--help}} { print_help_screen exit 0 From 51c013037e003c496ebd33b893a1a3c78173c567 Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Wed, 25 Mar 2026 12:03:53 +0200 Subject: [PATCH 2/7] Fix mem leaks and minor issues from review --- src/client_comp.c | 6 ++++-- src/replication.c | 19 ++++++++++++++----- src/server.h | 2 +- tests/integration/replication-buffer.tcl | 2 +- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/client_comp.c b/src/client_comp.c index 6cf8ae0d5fb..a4a23433de5 100644 --- a/src/client_comp.c +++ b/src/client_comp.c @@ -190,6 +190,7 @@ static void zstdEnd(compressionState *st) { * anyways */ output.pos = 0; } + zfree(tmp); } ZSTD_freeCStream(st->ctx.zstdCCtx); st->ctx.zstdCCtx = NULL; @@ -342,7 +343,7 @@ static int connCompressionRead(struct connection *conn, void *buf, size_t buf_le } if (curr <= 0 && nread <= 0) break; - } while (decompressed <= buf_len); + } while (decompressed < buf_len); if (decompressed == 0 && connGetState(cc->underlying) == CONN_STATE_CONNECTED) return -1; @@ -613,7 +614,7 @@ int RedisRegisterConnectionTypeCompression(void) { /* Create compression state for the client */ int compressionStateCreate(client *c) { - compressionState *st = zmalloc(sizeof(compressionState)); + compressionState *st = zcalloc(sizeof(compressionState)); st->type = &zstdType; st->last_write = 0; st->write_flush_pending = 0; @@ -694,6 +695,7 @@ void clientDestroyCompressionState(client *c) { * underlying connection. */ if (c->conn) { compressionConnection *cc = (compressionConnection*)c->conn; + compressionPendingRemove(cc); c->conn = cc->underlying; zfree(cc); diff --git a/src/replication.c b/src/replication.c index 1a79c48406b..a3d91607fde 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1490,11 +1490,20 @@ void replconfCommand(client *c) { return; if (server.io_threads_num > 1) { - serverLog(LL_NOTICE, "Client #%llu request for replication " - "compression with level %ld accepted", - (unsigned long long)c->id, level); - - c->compression_level = level; + if (level == (long)server.repl_compression) { + serverLog(LL_NOTICE, "Client #%llu request for replication " + "compression with level %ld accepted", + (unsigned long long)c->id, level); + + c->compression_level = level; + } else { + serverLog(LL_NOTICE, "Client #%llu request for replication " + "compression rejected. Compression levels differ.", + (unsigned long long)c->id); + c->compression_level = 0; + addReplyErrorFormat(c, "Requested level %ld differs from master's level: %d", + level, server.repl_compression); + } } else { serverLog(LL_NOTICE, "Client #%llu request for replication " "compression rejected. Replication compression is only " diff --git a/src/server.h b/src/server.h index 39a31fb5f21..adaf6c951e2 100644 --- a/src/server.h +++ b/src/server.h @@ -529,8 +529,8 @@ typedef enum { REPL_STATE_RECEIVE_PORT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_IP_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_REQ_REPLY, /* Wait for REPLCONF reply */ - REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_CLIENT_COMP, /* Wait for CLIENT_COMP reply */ + REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_SEND_PSYNC, /* Send PSYNC */ REPL_STATE_RECEIVE_PSYNC_REPLY, /* Wait for PSYNC reply */ /* --- End of handshake states --- */ diff --git a/tests/integration/replication-buffer.tcl b/tests/integration/replication-buffer.tcl index 5c224e400b7..b57f1289d80 100644 --- a/tests/integration/replication-buffer.tcl +++ b/tests/integration/replication-buffer.tcl @@ -196,7 +196,7 @@ start_server {} { # revert the config if {$::compression} { - $replica2 config set repl-rdb-channel no + $replica2 config set repl-rdb-channel yes } } From caade6aa2fc680835c0f06428c61f57de39e008e Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Thu, 26 Mar 2026 11:25:58 +0200 Subject: [PATCH 3/7] Fix review comments --- src/client_comp.c | 2 +- src/iothread.c | 2 ++ src/networking.c | 3 ++- src/replication.c | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/client_comp.c b/src/client_comp.c index a4a23433de5..e7bb7ac6a32 100644 --- a/src/client_comp.c +++ b/src/client_comp.c @@ -388,13 +388,13 @@ static int connCompressionWrite(connection *conn, const void *data, size_t len) /* Write whatever we have available in the compressed buffer */ int written = 0; int err = compressAndWrite(c, &written); - cc->last_written += written; if (err) { if (connGetState(c->conn) != CONN_STATE_CONNECTED) { return -1; } return consumed; } + cc->last_written += written; if (written == 0 && state->output.written == state->output.consumed) break; diff --git a/src/iothread.c b/src/iothread.c index 594dd598ee2..41b08e9d205 100644 --- a/src/iothread.c +++ b/src/iothread.c @@ -941,6 +941,8 @@ int IOThreadCron(struct aeEventLoop *eventLoop, long long id, void *clientData) /* Run cron tasks for the clients in the IO thread. */ IOThreadClientsCron(t); + t->cronloops++; + return 1000/IO_DEFAULT_HZ; } diff --git a/src/networking.c b/src/networking.c index 31072f1707f..f75751e38bf 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2733,6 +2733,8 @@ static inline int _writeToClientSlaveIOThread(client *c, ssize_t *nwritten) { int consumed = connWrite(c->conn, o->buf+c->io_curr_block_pos, pos-c->io_curr_block_pos); + if (consumed <= 0) return C_ERR; + /* Note, that consumed is how much bytes we've read from the repl buffer, * where as the bytes we've written into the socket may be different if * connCheckLastWritten returns so (f.e compression case) */ @@ -2747,7 +2749,6 @@ static inline int _writeToClientSlaveIOThread(client *c, ssize_t *nwritten) { } else { *nwritten += consumed; } - if (consumed <= 0) return C_ERR; /* Advance the block position with consumed, because that's how much * bytes were read from the repl buffer node. *nwritten stores how diff --git a/src/replication.c b/src/replication.c index a3d91607fde..a709a4c367c 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1503,6 +1503,7 @@ void replconfCommand(client *c) { c->compression_level = 0; addReplyErrorFormat(c, "Requested level %ld differs from master's level: %d", level, server.repl_compression); + return; } } else { serverLog(LL_NOTICE, "Client #%llu request for replication " From 71fe177d9ecd4716f62c71ad95fbdec3c71254e5 Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Thu, 26 Mar 2026 12:08:00 +0200 Subject: [PATCH 4/7] remove unused client flag --- src/server.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/server.h b/src/server.h index adaf6c951e2..9803d0fc23d 100644 --- a/src/server.h +++ b/src/server.h @@ -462,7 +462,6 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CLIENT_IO_CLOSE_ASAP (1ULL<<4) /* Close this client ASAP in IO thread. */ #define CLIENT_IO_PENDING_CRON (1ULL<<5) /* The client is pending cron job, to be processed in main thread. */ #define CLIENT_IO_COMPRESSION_ENABLED (1ULL<<6) /* The client compression is enabled for this client*/ -#define CLIENT_IO_READ_DECOMPRESSED_CRON (1ULL<<7) /* The client is reading only decompressed data inside its querybuf */ /* Definitions for client read errors. These error codes are used to indicate * various issues that can occur while reading or parsing data from a client. */ From 7aee551979c6660fecda1670df4818b9b664c83a Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Fri, 27 Mar 2026 14:22:25 +0200 Subject: [PATCH 5/7] Fix maxattempts count in wait_for_sync proc in tests --- tests/support/util.tcl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/support/util.tcl b/tests/support/util.tcl index ef5a1a75cbd..146e7d63674 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -130,12 +130,16 @@ proc waitForBgrewriteaof r { } proc wait_for_sync r { - set maxtries 200 + set maxtries 50 # tsan adds significant overhead to the execution time, so we increase the # wait time here JIC if {$::tsan} { set maxtries 100 } + # same for compression + if {$::compression} { + set maxtries 200 + } wait_for_condition $maxtries 100 { [status $r master_link_status] eq "up" From e95e2517b55c36fcb76a03c1bfa9b718fa8dfb63 Mon Sep 17 00:00:00 2001 From: Mincho Paskalev Date: Tue, 2 Jun 2026 12:27:02 +0300 Subject: [PATCH 6/7] fix minor issues --- .github/workflows/daily.yml | 2 +- src/client_comp.c | 2 ++ src/replication.c | 8 +++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 5b0e102b0d1..250da230478 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -504,7 +504,7 @@ jobs: echo "skiptests: ${{github.event.inputs.skiptests}}" echo "test_args: ${{github.event.inputs.test_args}}" echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} diff --git a/src/client_comp.c b/src/client_comp.c index e7bb7ac6a32..8041bb9cae7 100644 --- a/src/client_comp.c +++ b/src/client_comp.c @@ -653,6 +653,7 @@ int clientCreateCompressionState(client *c, compressionDirection dir) { if (st->type->init_compress(st, c->compression_level) == -1) { compressionStateDestroy(c->compression_state); + c->compression_state = NULL; return 0; } @@ -665,6 +666,7 @@ int clientCreateCompressionState(client *c, compressionDirection dir) { if (st->type->init_decompress(st) == -1) { compressionStateDestroy(c->compression_state); + c->compression_state = NULL; return 0; } diff --git a/src/replication.c b/src/replication.c index a709a4c367c..73f4ad851e6 100644 --- a/src/replication.c +++ b/src/replication.c @@ -4248,8 +4248,14 @@ int replDataBufStreamToDb(replDataBuf *buf, replDataBufToDbCtx *ctx) { processed += consumed; buf->used -= consumed; - if (server.repl_debug_pause & REPL_DEBUG_ON_STREAMING_REPL_BUF) + if (server.repl_debug_pause & REPL_DEBUG_ON_STREAMING_REPL_BUF) { debugPauseProcess(); + /* Always process events after debug break as tests may have + * ran other debug commands we need to process and it's not + * always possible to enter the below condition relying on + * `server.loading_process_events_interval_bytes`. */ + processEventsWhileBlocked(); + } /* Check if we should yield back to the event loop */ if (server.loading_process_events_interval_bytes && From 6ea000bc9f8f82bbf7637c8cf891f4062de0679b Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Thu, 11 Jun 2026 15:09:56 +0800 Subject: [PATCH 7/7] test --- src/ae.h | 2 +- src/client_comp.c | 598 ++++++++++++------------------------------- src/client_comp.h | 15 +- src/cluster_asm.c | 12 +- src/cluster_legacy.c | 2 +- src/connection.c | 2 - src/connection.h | 81 +----- src/iothread.c | 18 +- src/networking.c | 96 ++++--- src/socket.c | 33 +-- src/tls.c | 2 - src/unix.c | 2 - 12 files changed, 279 insertions(+), 584 deletions(-) diff --git a/src/ae.h b/src/ae.h index 4d6de8049a9..996d48b0f83 100644 --- a/src/ae.h +++ b/src/ae.h @@ -89,7 +89,7 @@ typedef struct aeEventLoop { aeBeforeSleepProc *beforesleep; aeBeforeSleepProc *aftersleep; int flags; - void *privdata[3]; + void *privdata[2]; } aeEventLoop; /* Prototypes */ diff --git a/src/client_comp.c b/src/client_comp.c index 8041bb9cae7..c0529823fe7 100644 --- a/src/client_comp.c +++ b/src/client_comp.c @@ -31,11 +31,15 @@ struct compressionState { } ctx; int write_flush_pending; /* write flush not yet completed */ int read_flush_pending; /* read flush not yet completed */ + int handle_pending; /* Set while processing pending decompressed data, in + * which case we must not read from the socket. */ mstime_t last_write; /* Time since last write. Used to check if it's time * to flush the buffer */ compressionDirection dir; }; +static int decompressInto(compressionState *state, char *buf, size_t buflen); + /* --- zstd --- */ static int zstdInitCompress(compressionState *st, int level) { @@ -208,425 +212,20 @@ static const compressionType zstdType = { .end = zstdEnd, }; -/* Compression connection. It wraps over existing connection in order to add - * compression capability during read/write operations. The `base` member is only - * used for getting the ConnectionType but whenever we do any operation on the - * connection we use the underlying pointer which is the actual connection. - * See CT_Compression. */ -typedef struct compressionConnection { - connection base; - connection *underlying; - listNode *pending_data_node; - size_t last_read; - size_t last_written; - int handle_pending; -} compressionConnection; - -int decompressInto(compressionState *state, char *buf, size_t buflen); - -static void compressionPendingAdd(compressionConnection *cc) { - if (cc->pending_data_node) return; - - if (!cc->underlying->el->privdata[2]) { - cc->underlying->el->privdata[2] = listCreate(); - } - - list *l = cc->underlying->el->privdata[2]; - listAddNodeTail(l, cc); - cc->pending_data_node = listLast(l); -} - -static void compressionPendingRemove(compressionConnection *cc) { - if (!cc->pending_data_node) return; - - list *l = cc->underlying->el->privdata[2]; - if (l && listLength(l) > 0 && listSearchKey(l, cc) == cc->pending_data_node) { - listDelNode(l, cc->pending_data_node); - } else if (cc->pending_data_node) { - zfree(cc->pending_data_node); - } - cc->pending_data_node = NULL; -} - -static const char *connCompressionGetType(connection *conn) { - UNUSED(conn); - return CONN_TYPE_COMPRESSION; -} - -static void connCompressionAeHandler(aeEventLoop *el, int fd, void *clientData, int mask) { - compressionConnection *cc = (compressionConnection *)clientData; - cc->underlying->type->ae_handler(el, fd, clientData, mask); -} - -static int connCompressionAddr(connection *conn, char *ip, size_t ip_len, int *port, int remote) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->addr(cc->underlying, ip, ip_len, port, remote); -} - -static int connCompressionIsLocal(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->is_local(cc->underlying); -} - -static void connCompressionShutdown(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - cc->underlying->type->shutdown(cc->underlying); -} - -static void connCompressionClose(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - cc->underlying->type->close(cc->underlying); - zfree(conn); -} - -static int connCompressionConnect(connection *conn, const char *addr, int port, const char *source_addr, ConnectionCallbackFunc connect_handler) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->connect(cc->underlying, addr, port, source_addr, connect_handler); -} - -static int connCompressionBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->blocking_connect(cc->underlying, addr, port, timeout); -} - -static int connCompressionAccept(connection *conn, ConnectionCallbackFunc accept_handler) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->accept(cc->underlying, accept_handler); -} - -/* If decompression is initialized for a connection it reads compressed data - * from the underlying connection (i.e usually from socket) into a temp buffer - * and decompresses into the passed `buf`. Tries to read and decompress as much - * as possible. - * There are some scenarios though that we may still have pending data to process - * (see clientHasPendingCompressedData). In such cases the connection is added - * to the event-loop's pending data and processed via connProcessPendingData. */ -static int connCompressionRead(struct connection *conn, void *buf, size_t buf_len) { - compressionConnection *cc = (compressionConnection*)conn; - client *c = connGetPrivateData(conn); - compressionState *state = c->compression_state; - if (!state || state->dir != DECOMPRESS) - return connRead(cc->underlying, buf, buf_len); - - cc->last_read = 0; - size_t decompressed = 0; - do { - int curr = decompressInto(state, (char*)buf + decompressed, buf_len - decompressed); - /* Decompression error, we should close the connection */ - if (curr < 0) { - cc->underlying->state = CONN_STATE_CLOSED; - break; - } - decompressed += curr; - - int nread = 0; - /* If the handle_pending flag is raised we only decompress whatever data - * we have read from the socket without reading anything more. Socket - * reading will happened when the event loop handles read event in which - * case the handle_pending flags wouldn't be raised. */ - if (!cc->handle_pending) { - nread = cc->underlying->type->read( - cc->underlying, - state->input.data + state->input.written, - state->input.size - state->input.written); - - if (nread < 0 && connGetState(cc->underlying) == CONN_STATE_ERROR) { - cc->last_read = -1; - return -1; - } - /* Even if nread == 0 we continue the loop until decompressInto has - * nothing more it can do. */ - if (nread > 0) { - cc->last_read += nread; - state->input.written += nread; - } - } - - if (curr <= 0 && nread <= 0) break; - } while (decompressed < buf_len); - - if (decompressed == 0 && connGetState(cc->underlying) == CONN_STATE_CONNECTED) - return -1; - - /* No need to give pending work to main thread as the client must be send to - * IO-thread soon enough. */ - if (c->running_tid != IOTHREAD_MAIN_THREAD_ID) { - if (connGetState(conn) == CONN_STATE_CONNECTED && clientHasPendingCompressedData(c)) { - compressionPendingAdd(cc); - } else if (cc->pending_data_node) { - compressionPendingRemove(cc); - } - } - - return decompressed; -} - -/* Compress all bytes from `data` and pass the compressed data to the underlying's - * connection write method. */ -static int connCompressionWrite(connection *conn, const void *data, size_t len) { - compressionConnection *cc = (compressionConnection*)conn; - client *c = connGetPrivateData(conn); - compressionState *state = c->compression_state; - /* If no compression state or we are not compressing data - defer to - * underlying connection. */ - if (!state || state->dir != COMPRESS) - return connWrite(cc->underlying, data, len); - - int consumed = 0; - cc->last_written = 0; - while ((size_t)consumed != len) { - int to_consume = - min(state->input.size - state->input.written, (int)(len - consumed)); - serverAssert(to_consume >= 0); - - memcpy(state->input.data + state->input.written, - (char*)data + consumed, to_consume); - - state->input.written += to_consume; - consumed += to_consume; - - /* Write whatever we have available in the compressed buffer */ - int written = 0; - int err = compressAndWrite(c, &written); - if (err) { - if (connGetState(c->conn) != CONN_STATE_CONNECTED) { - return -1; - } - return consumed; - } - cc->last_written += written; - - if (written == 0 && state->output.written == state->output.consumed) - break; - } - - return consumed; -} - -static int connCompressionWritev(connection *conn, const struct iovec *iov, int iovcnt) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->writev(cc->underlying, iov, iovcnt); -} - -static int connCompressionSetWriteHandler(connection *conn, ConnectionCallbackFunc handler, int barrier) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->set_write_handler(cc->underlying, handler, barrier); -} - -static int connCompressionSetReadHandler(connection *conn, ConnectionCallbackFunc handler) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->set_read_handler(cc->underlying, handler); -} - -static const char *connCompressionGetLastError(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->get_last_error(cc->underlying); -} - -static ssize_t connCompressionSyncWrite(connection *conn, char *ptr, ssize_t size, long long timeout) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->sync_write(cc->underlying, ptr, size, timeout); -} - -static ssize_t connCompressionSyncRead(connection *conn, char *ptr, ssize_t size, long long timeout) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->sync_read(cc->underlying, ptr, size, timeout); -} - -static ssize_t connCompressionSyncReadLine(connection *conn, char *ptr, ssize_t size, long long timeout) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->sync_readline(cc->underlying, ptr, size, timeout); -} - -static size_t connCompressionGetLastRead(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->last_read; -} - -static size_t connCompressionGetLastWritten(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->last_written; -} - -static void connCompressionUnbindEventLoop(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - aeEventLoop *el = cc->underlying->el; - if (el) { - int fd = cc->underlying->fd; - int mask = aeGetFileEvents(el, cc->underlying->fd); - if (mask & AE_READABLE) aeDeleteFileEvent(el, fd, AE_READABLE); - if (mask & AE_WRITABLE) aeDeleteFileEvent(el, fd, AE_WRITABLE); - - if (cc->pending_data_node) { - compressionPendingRemove(cc); - } - } -} - -static int connCompressionRebindEventLoop(connection *conn, aeEventLoop *el) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->type->rebind_event_loop(cc->underlying, el); -} - -static aeEventLoop *connCompressionGetEventLoop(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->el; -} - -static void connCompressionUnsetEventLoop(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - cc->underlying->el = NULL; -} - -static int connCompressionGetFd(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->fd; -} - -static int connCompressionGetIovcnt(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->iovcnt; -} - -static ConnectionState connCompressionGetState(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->state; -} - -static int connCompressionGetLastErrno(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->last_errno; -} - -static int connCompressionHasReadHandler(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->read_handler != NULL; -} - -static int connCompressionHasWriteHandler(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->write_handler != NULL; -} - -static void connCompressionSetPrivateData(connection *conn, void *data) { - compressionConnection *cc = (compressionConnection*)conn; - cc->underlying->private_data = data; -} - -static void *connCompressionGetPrivateData(connection *conn) { - compressionConnection *cc = (compressionConnection*)conn; - return cc->underlying->private_data; -} - -static int compressionHasPendingData(struct aeEventLoop *el) { - list *pending_list = el->privdata[2]; - if (!pending_list) - return 0; - return listLength(pending_list) > 0; -} - -/* Handling pending data involves calling the read handler of the underlying - * connection. We don't actually want to call connRead on it as we handle only - * pending data(i.e internal buffers of the compression library not yet flushed - * or non-consumed decompressed data in our temp buffers) and not any actual - * read events. */ -static int compressionProcessPendingData(struct aeEventLoop *el) { - list *pending_list = el->privdata[2]; - if (!pending_list || listLength(pending_list) == 0) - return 0; - - listIter li; - listNode *ln; - int processed = 0; - listRewind(pending_list,&li); - while((ln = listNext(&li))) { - compressionConnection *cc = listNodeValue(ln); - if (!cc || !cc->underlying || !connHasReadHandler((connection*)cc)) continue; - - cc->handle_pending = 1; - cc->underlying->read_handler(cc->underlying); - cc->handle_pending = 0; - - ++processed; - } - return processed; -} - -/* ConnectionType for the compression connection */ -static ConnectionType CT_Compression = { - /* connection type */ - .get_type = connCompressionGetType, - - /* ae & accept & listen & error & address handler */ - .ae_handler = connCompressionAeHandler, - .addr = connCompressionAddr, - .is_local = connCompressionIsLocal, - - /* shutdown/close connection */ - .shutdown = connCompressionShutdown, - .close = connCompressionClose, - - /* connect & accept */ - .connect = connCompressionConnect, - .blocking_connect = connCompressionBlockingConnect, - .accept = connCompressionAccept, - - /* event loop */ - .unbind_event_loop = connCompressionUnbindEventLoop, - .rebind_event_loop = connCompressionRebindEventLoop, - - /* IO */ - .read = connCompressionRead, - .write = connCompressionWrite, - .writev = connCompressionWritev, - .set_write_handler = connCompressionSetWriteHandler, - .set_read_handler = connCompressionSetReadHandler, - .get_last_error = connCompressionGetLastError, - .sync_write = connCompressionSyncWrite, - .sync_read = connCompressionSyncRead, - .sync_readline = connCompressionSyncReadLine, - .get_last_read = connCompressionGetLastRead, - .get_last_written = connCompressionGetLastWritten, - - /* pending data */ - .has_pending_data = compressionHasPendingData, - .process_pending_data = compressionProcessPendingData, - - .get_peer_cert = NULL, - .get_peer_username = NULL, - - /* connection accessors */ - .get_event_loop = connCompressionGetEventLoop, - .unset_event_loop = connCompressionUnsetEventLoop, - .get_fd = connCompressionGetFd, - .get_iovcnt = connCompressionGetIovcnt, - .get_state = connCompressionGetState, - .get_last_errno = connCompressionGetLastErrno, - .has_read_handler = connCompressionHasReadHandler, - .has_write_handler = connCompressionHasWriteHandler, - .set_private_data = connCompressionSetPrivateData, - .get_private_data = connCompressionGetPrivateData, -}; - -int RedisRegisterConnectionTypeCompression(void) { - return connTypeRegister(&CT_Compression); -} - /* Create compression state for the client */ -int compressionStateCreate(client *c) { +static void compressionStateCreate(client *c) { compressionState *st = zcalloc(sizeof(compressionState)); st->type = &zstdType; st->last_write = 0; st->write_flush_pending = 0; st->read_flush_pending = 0; + st->handle_pending = 0; st->dir = CD_INVALID; c->compression_state = st; - - return 1; } -void compressionStateDestroy(compressionState *state) { +static void compressionStateDestroy(compressionState *state) { if (state == NULL) return; state->type->end(state); @@ -639,12 +238,12 @@ void compressionStateDestroy(compressionState *state) { * initialized. `dir` indicates the compression direction, i.e if the client * will compress or decompress data. * Currently only viable for master/replica clients. */ -int clientCreateCompressionState(client *c, compressionDirection dir) { +static int clientCreateCompressionState(client *c, compressionDirection dir) { /* Client compression already initialized */ if (c->compression_state != NULL) return 1; - serverAssert(compressionStateCreate(c)); + compressionStateCreate(c); compressionState *st = c->compression_state; @@ -669,7 +268,7 @@ int clientCreateCompressionState(client *c, compressionDirection dir) { c->compression_state = NULL; return 0; } - + st->dir = DECOMPRESS; serverLog(LL_NOTICE, "Decompression for master client initialized."); @@ -678,31 +277,12 @@ int clientCreateCompressionState(client *c, compressionDirection dir) { serverAssert(0); } - compressionConnection *cc = zmalloc(sizeof(compressionConnection)); - cc->base.type = &CT_Compression; - cc->underlying = c->conn; - cc->pending_data_node = NULL; - cc->last_read = 0; - cc->last_written = 0; - cc->handle_pending = 0; - c->conn = (connection*)cc; - return 1; } void clientDestroyCompressionState(client *c) { if (c->compression_state == NULL) return; - /* If the connection is not destroyed yet we need to switch back to the - * underlying connection. */ - if (c->conn) { - compressionConnection *cc = (compressionConnection*)c->conn; - compressionPendingRemove(cc); - c->conn = cc->underlying; - - zfree(cc); - } - compressionStateDestroy(c->compression_state); c->compression_state = NULL; c->compression_level = 0; @@ -733,11 +313,26 @@ void clientDisableCompression(client *c) { c->io_flags &= ~CLIENT_IO_COMPRESSION_ENABLED; } -/* Compress any data send for compression (see connCompressionWrite) and +/* Return 1 if data written to the client must be compressed. Note that this + * is independent from CLIENT_IO_COMPRESSION_ENABLED: once the compressed + * stream has started the peer expects compressed data, no matter which thread + * the client is currently running on. */ +int clientCompressesWrites(client *c) { + return c->compression_state && c->compression_state->dir == COMPRESS; +} + +/* Return 1 if data read from the client must be decompressed. */ +int clientDecompressesReads(client *c) { + return c->compression_state && c->compression_state->dir == DECOMPRESS; +} + +/* Compress any data fed for compression (see clientCompressAndWriteBuf) and * write to socket. Compression library may not return compressed data * immediately so this call may not write anything to socket. * Force flushes the compressed buffer according to compression_max_latency. - * Return number of bytes written to socket or -1 on socket write error. */ + * Return 0 on success and 1 on error. On a compression error the client is + * scheduled for close as the compressed stream cannot be recovered; on a + * socket error the caller can inspect the connection state. */ int compressAndWrite(client *c, int *tot_written) { if (c->compression_level <= 0) return 0; @@ -760,20 +355,20 @@ int compressAndWrite(client *c, int *tot_written) { * data. */ int flush = mstime() - state->last_write > server.compression_max_latency; if (state->type->compress(state, flush) == -1) { - clientDestroyCompressionState(c); + /* The compressed stream is broken, there is no way to recover: + * close the link and let the replica resync. */ + freeClientAsync(c); return 1; } } - compressionConnection *cc = (compressionConnection*)c->conn; - /* Try to write all the data available in the compressed buffer. */ *tot_written = 0; int towrite = state->output.written - state->output.consumed; do { int written = connWrite( - cc->underlying, state->output.data + state->output.consumed, towrite); + c->conn, state->output.data + state->output.consumed, towrite); if (written < 0) { return 1; } @@ -798,11 +393,105 @@ int compressAndWrite(client *c, int *tot_written) { return 0; } +/* Compress `len` bytes from `data` and write the compressed output to the + * client's connection. The compression library may buffer data internally, + * so consuming input doesn't imply an actual socket write. + * Return the number of bytes consumed from `data` (uncompressed), or -1 on + * connection error. `*socket_written` is set to the number of (compressed) + * bytes actually written to the socket. */ +ssize_t clientCompressAndWriteBuf(client *c, const char *data, size_t len, ssize_t *socket_written) { + compressionState *state = c->compression_state; + serverAssert(state && state->dir == COMPRESS); + + *socket_written = 0; + size_t consumed = 0; + while (consumed != len) { + int to_consume = + min(state->input.size - state->input.written, (int)(len - consumed)); + serverAssert(to_consume >= 0); + + memcpy(state->input.data + state->input.written, + data + consumed, to_consume); + + state->input.written += to_consume; + consumed += to_consume; + + /* Write whatever we have available in the compressed buffer */ + int written = 0; + if (compressAndWrite(c, &written)) { + if (connGetState(c->conn) != CONN_STATE_CONNECTED) { + return -1; + } + return consumed; + } + *socket_written += written; + + if (written == 0 && state->output.written == state->output.consumed) + break; + } + + return consumed; +} + +/* Read compressed data from the client's connection and decompress it into + * `buf`. Tries to read and decompress as much as possible. + * Return the number of decompressed bytes placed in `buf`, 0 if the peer + * closed the connection, or -1 if no data is available or on error (on a + * decompression error `c->read_error` is set as the stream cannot be + * recovered). + * We may not be able to consume all the data we read from the socket in one + * call (e.g. `buf` is full while decompressed data is still pending). Such + * leftovers are processed via compressionProcessPendingReads. */ +int clientReadAndDecompress(client *c, char *buf, size_t buf_len, size_t *socket_read) { + compressionState *state = c->compression_state; + serverAssert(state && state->dir == DECOMPRESS); + + *socket_read = 0; + size_t decompressed = 0; + do { + int curr = decompressInto(state, buf + decompressed, buf_len - decompressed); + if (curr < 0) { + /* Decompression error: the compressed stream is broken, there is + * no way to recover, so close the connection with the master. */ + c->read_error = CLIENT_READ_CONN_DISCONNECTED; + return -1; + } + decompressed += curr; + + int nread = 0; + /* If the handle_pending flag is raised we only decompress whatever data + * we have already read from the socket without reading anything more. + * Socket reading will happen when the event loop handles the read + * event, in which case the handle_pending flag wouldn't be raised. */ + if (!state->handle_pending) { + nread = connRead(c->conn, + state->input.data + state->input.written, + state->input.size - state->input.written); + + if (nread < 0 && connGetState(c->conn) == CONN_STATE_ERROR) + return -1; + /* Even if nread == 0 we continue the loop until decompressInto has + * nothing more it can do. */ + if (nread > 0) { + *socket_read += nread; + state->input.written += nread; + } + } + + if (curr <= 0 && nread <= 0) break; + } while (decompressed < buf_len); + + if (decompressed == 0 && connGetState(c->conn) == CONN_STATE_CONNECTED) + return -1; + + return decompressed; +} + /* Decompress input compressed data and put it in `buf`. If decompressed data * is more than buflen this function must be called again so output data can * be consumed. If buflen is sufficiently large this function will decompress * as much data as possible. */ -int decompressInto(compressionState *state, char *buf, size_t buflen) { +static int decompressInto(compressionState *state, char *buf, size_t buflen) { if (buflen == 0) return 0; @@ -921,3 +610,38 @@ int clientHasPendingCompressedData(client *c) { state->output.written > state->output.consumed; } +/* Process pending decompressed data of the given compression clients. There + * are some scenarios in which a read event doesn't consume all the available + * data (see clientHasPendingCompressedData), e.g. the query buffer filled up + * while decompressed data was still pending. In such cases this function, + * called from the IO thread's beforeSleep, re-invokes the read handler in + * decompress-only mode (no socket reads). + * Return the number of clients that still have pending data afterwards, so + * the caller knows it should not block waiting for events. */ +int compressionProcessPendingReads(list *compression_clients) { + if (listLength(compression_clients) == 0) return 0; + + int pending = 0; + listIter li; + listNode *ln; + listRewind(compression_clients, &li); + while ((ln = listNext(&li))) { + client *c = listNodeValue(ln); + + if (c->io_flags & CLIENT_IO_CLOSE_ASAP) continue; + if (!clientHasPendingCompressedData(c)) continue; + if (!connHasReadHandler(c->conn)) continue; + + compressionState *state = c->compression_state; + state->handle_pending = 1; + readQueryFromClient(c->conn); + state->handle_pending = 0; + + if (!(c->io_flags & CLIENT_IO_CLOSE_ASAP) && + clientHasPendingCompressedData(c)) + { + pending++; + } + } + return pending; +} diff --git a/src/client_comp.h b/src/client_comp.h index a5a308fa937..db0bff531ee 100644 --- a/src/client_comp.h +++ b/src/client_comp.h @@ -12,6 +12,8 @@ #include +#include "adlist.h" + /* Opaque handle to client's compression state used internally by client_comp */ typedef struct compressionState compressionState; @@ -23,12 +25,17 @@ typedef enum { DECOMPRESS, } compressionDirection; -int clientCreateCompressionState(struct client *c, compressionDirection dir); +int clientEnableCompression(struct client *c, compressionDirection dir); +void clientDisableCompression(struct client *c); void clientDestroyCompressionState(struct client *c); -int clientEnableCompression(struct client *c, compressionDirection dir); +int clientCompressesWrites(struct client *c); +int clientDecompressesReads(struct client *c); -void clientDisableCompression(struct client *c); +ssize_t clientCompressAndWriteBuf(struct client *c, const char *data, size_t len, + ssize_t *socket_written); +int clientReadAndDecompress(struct client *c, char *buf, size_t buf_len, + size_t *socket_read); int compressAndWrite(struct client *c, int *tot_written); @@ -39,4 +46,6 @@ int readFromBufAndDecompress(struct client *c, char *input_buf, size_t input_len int clientHasPendingCompressionFlush(struct client *c); int clientHasPendingCompressedData(struct client *c); +int compressionProcessPendingReads(list *compression_clients); + #endif /* __CLIENT_COMP_H */ diff --git a/src/cluster_asm.c b/src/cluster_asm.c index 63d662026fe..ac94a5103c9 100644 --- a/src/cluster_asm.c +++ b/src/cluster_asm.c @@ -1373,7 +1373,7 @@ void asmRdbChannelSyncWithSource(connection *conn) { * surface the error condition. * using shutdown() instead of connShutdown() because connTLSShutdown() * will free the connection directly, which is not what we want. */ - shutdown(connGetFd(conn), SHUT_RDWR); + shutdown(conn->fd, SHUT_RDWR); connRead(conn, buf, 1); } @@ -1524,7 +1524,7 @@ void asmSyncWithSource(connection *conn) { /* Check if the fail point is active for this channel and state */ if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) { char buf[1]; - shutdown(connGetFd(conn), SHUT_RDWR); + shutdown(conn->fd, SHUT_RDWR); connRead(conn, buf, 1); } @@ -1693,7 +1693,7 @@ void asmSlotSnapshotAndStreamStart(struct asmTask *task) { if (task == NULL || task->state != ASM_WAIT_BGSAVE_START) return; if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_RDB_CHANNEL, task->state))) { - shutdown(connGetFd(task->rdb_channel_client->conn), SHUT_RDWR); + shutdown(task->rdb_channel_client->conn->fd, SHUT_RDWR); return; } task->main_channel_client->replstate = SLAVE_STATE_SEND_BULK_AND_STREAM; @@ -2449,7 +2449,7 @@ static void asmSyncBufferReadFromConn(connection *conn) { /* ASM_ACCUMULATE_BUF and ASM_STREAMING_BUF fail points are handled here */ if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) - shutdown(connGetFd(conn), SHUT_RDWR); + shutdown(conn->fd, SHUT_RDWR); replDataBuf *buf = &task->sync_buffer; if (task->state == ASM_STREAMING_BUF) { @@ -2551,7 +2551,7 @@ void asmSyncBufferStreamToDb(asmTask *task) { task->dest_offset); if (unlikely(asmDebugIsFailPointActive(ASM_IMPORT_MAIN_CHANNEL, task->state))) - shutdown(connGetFd(task->main_channel_conn), SHUT_RDWR); /* Simulate a failure */ + shutdown(task->main_channel_conn->fd, SHUT_RDWR); /* Simulate a failure */ /* ACK offset after streaming buffer is done. */ asmImportSendACK(task); @@ -2577,7 +2577,7 @@ void asmSendStreamEofIfDrained(asmTask *task) { serverLog(LL_NOTICE, "Slot migration command stream drained, sending STREAM-EOF to the destination"); if (unlikely(asmDebugIsFailPointActive(ASM_MIGRATE_MAIN_CHANNEL, task->state))) - shutdown(connGetFd(c->conn), SHUT_RDWR); + shutdown(c->conn->fd, SHUT_RDWR); /* Send STREAM-EOF to indicate the end of the stream. */ char *err = sendCommand(c->conn, "CLUSTER", "SYNCSLOTS", "STREAM-EOF", NULL); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index abf052d7a37..c93aea2ad6b 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1221,7 +1221,7 @@ void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { * one of the links. The existing link is more likely the outdated one, but it's * possible the other node may need to open another link. */ serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d", - connGetFd(node->inbound_link->conn), node->name, connGetFd(link->conn)); + node->inbound_link->conn->fd, node->name, link->conn->fd); freeClusterLink(node->inbound_link); } serverAssert(!node->inbound_link); diff --git a/src/connection.c b/src/connection.c index 191edfae15f..38c7564fa5b 100644 --- a/src/connection.c +++ b/src/connection.c @@ -68,8 +68,6 @@ int connTypeInitialize(void) { /* may fail if without BUILD_TLS=yes */ RedisRegisterConnectionTypeTLS(); - RedisRegisterConnectionTypeCompression(); - return C_OK; } diff --git a/src/connection.h b/src/connection.h index 9310c2362f7..3d3691edb4e 100644 --- a/src/connection.h +++ b/src/connection.h @@ -40,7 +40,6 @@ typedef enum { #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" #define CONN_TYPE_TLS "tls" -#define CONN_TYPE_COMPRESSION "compression" #define CONN_TYPE_MAX 8 /* 8 is enough to be extendable */ typedef void (*ConnectionCallbackFunc)(struct connection *conn); @@ -82,8 +81,6 @@ typedef struct ConnectionType { ssize_t (*sync_write)(struct connection *conn, char *ptr, ssize_t size, long long timeout); ssize_t (*sync_read)(struct connection *conn, char *ptr, ssize_t size, long long timeout); ssize_t (*sync_readline)(struct connection *conn, char *ptr, ssize_t size, long long timeout); - size_t (*get_last_read)(struct connection *conn); - size_t (*get_last_written)(struct connection *conn); /* event loop */ void (*unbind_event_loop)(struct connection *conn); @@ -98,17 +95,6 @@ typedef struct ConnectionType { /* Get peer username based on connection type */ sds (*get_peer_username)(connection *conn); - - aeEventLoop* (*get_event_loop)(connection *conn); - void (*unset_event_loop)(connection *conn); - int (*get_fd)(connection *conn); - int (*get_iovcnt)(connection *conn); - ConnectionState (*get_state)(connection *conn); - int (*get_last_errno)(connection *conn); - int (*has_read_handler)(connection *conn); - int (*has_write_handler)(connection *conn); - void (*set_private_data)(connection *conn, void *data); - void* (*get_private_data)(connection *conn); } ConnectionType; struct connection { @@ -271,33 +257,13 @@ static inline ssize_t connSyncReadLine(connection *conn, char *ptr, ssize_t size return conn->type->sync_readline(conn, ptr, size, timeout); } -/* If connection type has a special way to get last read bytes from connection - * save the value in last_read and return 1. Otherwise return 0. - * F.e when compression is enabled we read compressed data from socket inside - * connRead but it returns the number of bytes decompressed. In order to get the - * number of bytes read from socket we call connCheckLastRead. */ -static inline int connCheckLastRead(connection *conn, size_t *last_read) { - if (!conn->type->get_last_read) return 0; - *last_read = conn->type->get_last_read(conn); - return 1; -} - -/* If connection type has a special way to get last written bytes to connection - * save the value in last_written and return 1. Otherwise return 0. */ -static inline int connCheckLastWritten(connection *conn, size_t *last_written) { - if (!conn->type->get_last_written) return 0; - *last_written = conn->type->get_last_written(conn); - return 1; -} - /* Return CONN_TYPE_* for the specified connection */ static inline const char *connGetType(connection *conn) { return conn->type->get_type(conn); } static inline int connLastErrorRetryable(connection *conn) { - int last_errno = conn->type->get_last_errno ? conn->type->get_last_errno(conn) : conn->last_errno; - return last_errno == EINTR; + return conn->last_errno == EINTR; } /* Get address information of a connection. @@ -348,52 +314,34 @@ static inline int connIsLocal(connection *conn) { return -1; } -static inline ConnectionState connGetState(connection *conn) { - return conn->type->get_state ? conn->type->get_state(conn) : conn->state; -} - -static inline int connGetFd(connection *conn) { - return conn->type->get_fd ? conn->type->get_fd(conn) : conn->fd; -} - -static inline int connGetIovcnt(connection *conn) { - return conn->type->get_iovcnt ? conn->type->get_iovcnt(conn) : conn->iovcnt; +static inline int connGetState(connection *conn) { + return conn->state; } /* Returns true if a write handler is registered */ static inline int connHasWriteHandler(connection *conn) { - return conn->type->has_write_handler ? - conn->type->has_write_handler(conn) : - (conn->write_handler != NULL); + return conn->write_handler != NULL; } /* Returns true if a read handler is registered */ static inline int connHasReadHandler(connection *conn) { - return conn->type->has_read_handler ? - conn->type->has_read_handler(conn) : - (conn->read_handler != NULL); + return conn->read_handler != NULL; } /* Returns true if the connection is bound to an event loop */ static inline int connHasEventLoop(connection *conn) { - return conn->type->get_event_loop ? - (conn->type->get_event_loop(conn) != NULL) : - (conn->el != NULL); + return conn->el != NULL; } /* Unbind the current event loop from the connection, so that it can be * rebind to a different event loop in the future. */ static inline void connUnbindEventLoop(connection *conn) { - if (!connHasEventLoop(conn)) return; + if (conn->el == NULL) return; connSetReadHandler(conn, NULL); connSetWriteHandler(conn, NULL); if (conn->type->unbind_event_loop) conn->type->unbind_event_loop(conn); - - if (conn->type->unset_event_loop) - conn->type->unset_event_loop(conn); - else - conn->el = NULL; + conn->el = NULL; } /* Rebind the connection to another event loop, read/write handlers must not @@ -404,17 +352,12 @@ static inline int connRebindEventLoop(connection *conn, aeEventLoop *el) { /* Associate a private data pointer with the connection */ static inline void connSetPrivateData(connection *conn, void *data) { - if (conn->type->set_private_data) - conn->type->set_private_data(conn, data); - else - conn->private_data = data; + conn->private_data = data; } /* Get the associated private data pointer */ static inline void *connGetPrivateData(connection *conn) { - return conn->type->get_private_data ? - conn->type->get_private_data(conn) : - conn->private_data; + return conn->private_data; } /* Return a text that describes the connection, suitable for inclusion @@ -423,8 +366,7 @@ static inline void *connGetPrivateData(connection *conn) { * For sockets, we always return "fd=" to maintain compatibility. */ static inline const char *connGetInfo(connection *conn, char *buf, size_t buf_len) { - int fd = conn ? connGetFd(conn) : -1; - snprintf(buf, buf_len-1, "fd=%i", conn == NULL ? -1 : fd); + snprintf(buf, buf_len-1, "fd=%i", conn == NULL ? -1 : conn->fd); return buf; } @@ -520,7 +462,6 @@ sds getListensInfoString(sds info); int RedisRegisterConnectionTypeSocket(void); int RedisRegisterConnectionTypeUnix(void); int RedisRegisterConnectionTypeTLS(void); -int RedisRegisterConnectionTypeCompression(void); /* Return 1 if connection is using TLS protocol, 0 if otherwise. */ static inline int connIsTLS(connection *conn) { diff --git a/src/iothread.c b/src/iothread.c index 41b08e9d205..80bf659415d 100644 --- a/src/iothread.c +++ b/src/iothread.c @@ -830,8 +830,13 @@ void IOThreadBeforeSleep(struct aeEventLoop *el) { /* Handle pending data(typical TLS). */ connTypeProcessPendingData(el); - /* If any connection type(typical TLS) still has pending unread data don't sleep at all. */ - int dont_sleep = connTypeHasPendingData(el); + /* Handle decompressed data that wasn't fully consumed by the last read + * event (e.g. the query buffer filled up). */ + int compression_pending = compressionProcessPendingReads(t->compression_clients); + + /* If any connection type(typical TLS) or compression client still has + * pending unread data don't sleep at all. */ + int dont_sleep = connTypeHasPendingData(el) || compression_pending > 0; /* Process clients from main thread, since the main thread may deliver clients * without notification during IO thread processing events. */ @@ -904,10 +909,11 @@ void IOThreadCompressionCron(IOThread *t) { if (c->io_flags & CLIENT_IO_CLOSE_ASAP) continue; serverAssert(c->compression_state); - /* Usually compressAndWrite will be called at the end of - * consumeAndTryWriteCompressed but when compression maximum latency ms - * have passed we want to force flush to the compression buffer so we - * don't have much delays between writes to the socket */ + /* Usually compressAndWrite is called via clientCompressAndWriteBuf + * when repl data is written to the replica, but when compression + * maximum latency ms have passed we want to force flush the + * compression buffer so we don't have much delays between writes to + * the socket */ if (clientHasPendingCompressionFlush(c)) { /* Only master/replica clients support client compression for now. */ serverAssert(c->flags & CLIENT_SLAVE); diff --git a/src/networking.c b/src/networking.c index f75751e38bf..6e3e77b94f1 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1914,7 +1914,7 @@ void unlinkClient(client *c) { * the main process is stale. SSL_shutdown() involves a handshake, * and it may block the caller when used with stale TLS state.*/ if (c->flags & CLIENT_REPL_RDB_CHANNEL) - shutdown(connGetFd(c->conn), SHUT_RDWR); + shutdown(c->conn->fd, SHUT_RDWR); else connShutdown(c->conn); } @@ -2565,7 +2565,7 @@ static payloadHeader *processSentDataInEncodedBuffer(client *c, char *start_ptr, * and 'nwritten' is an output parameter, it means how many bytes server write * to client. */ static int _writevToClient(client *c, ssize_t *nwritten) { - int iovmax = min(IOV_MAX, connGetIovcnt(c->conn)); + int iovmax = min(IOV_MAX, c->conn->iovcnt); struct iovec iov[iovmax]; ReplyIOV reply_iov = {iov, iovmax}; @@ -2730,30 +2730,36 @@ static inline int _writeToClientSlaveIOThread(client *c, ssize_t *nwritten) { size_t pos = c->io_curr_repl_node == c->io_bound_repl_node ? c->io_bound_block_pos : o->used; if (pos > c->io_curr_block_pos) { - int consumed = connWrite(c->conn, o->buf+c->io_curr_block_pos, - pos-c->io_curr_block_pos); - - if (consumed <= 0) return C_ERR; - - /* Note, that consumed is how much bytes we've read from the repl buffer, - * where as the bytes we've written into the socket may be different if - * connCheckLastWritten returns so (f.e compression case) */ - /* TODO: if compression is generalized for all types of clients we will - * need to add this check in writeToClientNonSlave also */ - size_t last_written = 0; - if (connCheckLastWritten(c->conn, &last_written)) { - *nwritten += last_written; + ssize_t consumed; + if (clientCompressesWrites(c)) { + /* Note, that consumed is how much bytes we've read from the repl + * buffer, whereas socket_written is the number of compressed bytes + * actually written to the socket, which would most certainly be a + * different number. */ + ssize_t socket_written = 0; + consumed = clientCompressAndWriteBuf(c, o->buf+c->io_curr_block_pos, + pos-c->io_curr_block_pos, + &socket_written); + if (consumed <= 0) { + if (consumed < 0) *nwritten = -1; + return C_ERR; + } + *nwritten += socket_written; /* Since nwritten stores the number of compressed bytes written to * socket we also store the uncompressed size for stats. */ atomicIncr(server.stat_net_repl_uncompressed_bytes, consumed); } else { + consumed = connWrite(c->conn, o->buf+c->io_curr_block_pos, + pos-c->io_curr_block_pos); + if (consumed <= 0) { + if (consumed < 0) *nwritten = -1; + return C_ERR; + } *nwritten += consumed; } /* Advance the block position with consumed, because that's how much - * bytes were read from the repl buffer node. *nwritten stores how - * much bytes were written to the socket, which for the compression - * case would most certainly be a different number. */ + * bytes were read from the repl buffer node. */ c->io_curr_block_pos += consumed; } /* If we fully sent the object and there are more nodes to send, go to the next one. */ @@ -2781,10 +2787,28 @@ static inline int _writeToClientSlave(client *c, ssize_t *nwritten) { serverAssert(o->used >= c->ref_block_pos); /* Send current block if it is not fully sent. */ if (o->used > c->ref_block_pos) { - *nwritten = connWrite(c->conn, o->buf+c->ref_block_pos, - o->used-c->ref_block_pos); - if (*nwritten <= 0) return C_ERR; - c->ref_block_pos += *nwritten; + ssize_t consumed; + if (clientCompressesWrites(c)) { + ssize_t socket_written = 0; + consumed = clientCompressAndWriteBuf(c, o->buf+c->ref_block_pos, + o->used-c->ref_block_pos, + &socket_written); + if (consumed <= 0) { + if (consumed < 0) *nwritten = -1; + return C_ERR; + } + *nwritten = socket_written; + atomicIncr(server.stat_net_repl_uncompressed_bytes, consumed); + } else { + consumed = connWrite(c->conn, o->buf+c->ref_block_pos, + o->used-c->ref_block_pos); + if (consumed <= 0) { + if (consumed < 0) *nwritten = -1; + return C_ERR; + } + *nwritten = consumed; + } + c->ref_block_pos += consumed; } /* If we fully sent the object on head, go to the next one. */ @@ -3940,12 +3964,25 @@ void readQueryFromClient(connection *conn) { /* Read as much as possible from the socket to save read(2) system calls. */ readlen = sdsavail(c->querybuf); } - nread = connRead(c->conn, c->querybuf+qblen, readlen); + size_t network_read = 0; + if (clientDecompressesReads(c)) { + /* nread is the number of decompressed bytes placed in the query + * buffer, whereas network_read is the number of (compressed) bytes + * actually read from the socket. */ + nread = clientReadAndDecompress(c, c->querybuf+qblen, readlen, &network_read); + } else { + nread = connRead(c->conn, c->querybuf+qblen, readlen); + if (nread > 0) network_read = nread; + } if (nread == -1) { - if (connGetState(conn) == CONN_STATE_CONNECTED) { + /* clientReadAndDecompress sets read_error on a decompression error, + * in which case the link is not recoverable even if the connection + * is still alive. */ + if (c->read_error == 0 && connGetState(conn) == CONN_STATE_CONNECTED) { goto done; } else { - c->read_error = CLIENT_READ_CONN_DISCONNECTED; + if (c->read_error == 0) + c->read_error = CLIENT_READ_CONN_DISCONNECTED; freeClientAsync(c); goto done; } @@ -3967,15 +4004,8 @@ void readQueryFromClient(connection *conn) { * c->lastinteraction will be updated during processClientsFromIOThread */ c->io_lastinteraction = server.unixtime; - size_t network_read; - if (!connCheckLastRead(c->conn, &network_read)) { - network_read = nread; - } else { - /* In case of compression nread is the number of decompressed bytes, - * whereas network_read stores the actual number of bytes read from - * socket. */ + if (clientDecompressesReads(c)) atomicIncr(server.stat_net_repl_decompressed_bytes, nread); - } if (c->flags & CLIENT_MASTER) { if (c->running_tid == IOTHREAD_MAIN_THREAD_ID) { diff --git a/src/socket.c b/src/socket.c index 4920ac3ff80..226b414f85e 100644 --- a/src/socket.c +++ b/src/socket.c @@ -423,8 +423,6 @@ static ConnectionType CT_Socket = { .sync_write = connSocketSyncWrite, .sync_read = connSocketSyncRead, .sync_readline = connSocketSyncReadLine, - .get_last_read = NULL, - .get_last_written = NULL, /* pending data */ .has_pending_data = NULL, @@ -432,43 +430,36 @@ static ConnectionType CT_Socket = { }; int connBlock(connection *conn) { - int fd = connGetFd(conn); - if (fd == -1) return C_ERR; - return anetBlock(NULL, fd); + if (conn->fd == -1) return C_ERR; + return anetBlock(NULL, conn->fd); } int connNonBlock(connection *conn) { - int fd = connGetFd(conn); - if (fd == -1) return C_ERR; - return anetNonBlock(NULL, fd); + if (conn->fd == -1) return C_ERR; + return anetNonBlock(NULL, conn->fd); } int connEnableTcpNoDelay(connection *conn) { - int fd = connGetFd(conn); - if (fd == -1) return C_ERR; - return anetEnableTcpNoDelay(NULL, fd); + if (conn->fd == -1) return C_ERR; + return anetEnableTcpNoDelay(NULL, conn->fd); } int connDisableTcpNoDelay(connection *conn) { - int fd = connGetFd(conn); - if (fd == -1) return C_ERR; - return anetDisableTcpNoDelay(NULL, fd); + if (conn->fd == -1) return C_ERR; + return anetDisableTcpNoDelay(NULL, conn->fd); } int connKeepAlive(connection *conn, int interval) { - int fd = connGetFd(conn); - if (fd == -1) return C_ERR; - return anetKeepAlive(NULL, fd, interval); + if (conn->fd == -1) return C_ERR; + return anetKeepAlive(NULL, conn->fd, interval); } int connSendTimeout(connection *conn, long long ms) { - int fd = connGetFd(conn); - return anetSendTimeout(NULL, fd, ms); + return anetSendTimeout(NULL, conn->fd, ms); } int connRecvTimeout(connection *conn, long long ms) { - int fd = connGetFd(conn); - return anetRecvTimeout(NULL, fd, ms); + return anetRecvTimeout(NULL, conn->fd, ms); } int RedisRegisterConnectionTypeSocket(void) diff --git a/src/tls.c b/src/tls.c index f506d327e13..0537dfef377 100644 --- a/src/tls.c +++ b/src/tls.c @@ -1235,8 +1235,6 @@ static ConnectionType CT_TLS = { .sync_write = connTLSSyncWrite, .sync_read = connTLSSyncRead, .sync_readline = connTLSSyncReadLine, - .get_last_read = NULL, - .get_last_written = NULL, /* pending data */ .has_pending_data = tlsHasPendingData, diff --git a/src/unix.c b/src/unix.c index 6aad6c91ad4..e26f0d27ad1 100644 --- a/src/unix.c +++ b/src/unix.c @@ -207,8 +207,6 @@ static ConnectionType CT_Unix = { .sync_write = connUnixSyncWrite, .sync_read = connUnixSyncRead, .sync_readline = connUnixSyncReadLine, - .get_last_read = NULL, - .get_last_written = NULL, /* pending data */ .has_pending_data = NULL,