Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions redis.conf
Original file line number Diff line number Diff line change
Expand Up @@ -1529,6 +1529,25 @@ auto-aof-rewrite-min-size 64mb
# will be found.
aof-load-truncated yes

# When the AOF file is corrupted in the middle (format errors), Redis can
# attempt to recover automatically by truncating the file at the end of the
# last valid command, discarding everything from the first format error to
# the end of the file, provided the discarded portion is smaller than the
# configured maximum size. This is more aggressive than aof-load-truncated,
# which only handles files that are truncated at the end.
#
# The aof-load-broken-max-size setting controls the maximum size in bytes
# of corrupted data that can be automatically truncated.
#
# If aof-load-broken is set to yes and the corrupted portion is smaller than
# aof-load-broken-max-size, Redis will truncate the corrupted data and start
# normally, logging a warning about the recovery. Otherwise, the server will
# exit with an error and require manual intervention using "redis-check-aof".
#
# This option is disabled by default since automatically truncating corrupted
# data can lead to data loss. Only enable this if you understand the risks
# and prefer availability over data integrity in corruption scenarios.
aof-load-broken no
aof-load-broken-max-size 4096

# Redis can create append-only base files in either RDB or AOF formats. Using
# the RDB format is always faster and more efficient, and disabling it is only
# supported for backward compatibility purposes.
Expand Down
49 changes: 41 additions & 8 deletions src/aof.c
Original file line number Diff line number Diff line change
Expand Up @@ -1658,7 +1658,7 @@ int loadSingleAppendOnlyFile(char *filename) {
/* Clean up. Command code may have changed argv/argc so we use the
* argv/argc of the client instead of the local variables. */
freeClientArgv(fakeClient);
if (server.aof_load_truncated) valid_up_to = ftello(fp);
if (server.aof_load_truncated || server.aof_load_broken) valid_up_to = ftello(fp);
if (server.key_load_delay)
debugDelay(server.key_load_delay);
}
Expand Down Expand Up @@ -1719,8 +1719,41 @@ int loadSingleAppendOnlyFile(char *filename) {
goto cleanup;

fmterr: /* Format error. */
serverLog(LL_WARNING, "Bad file format reading the append only file %s: "
"make a backup of your AOF file, then use ./redis-check-aof --fix <filename.manifest>", filename);
/* fmterr may be caused by an accidental machine shutdown, so if the broken tail
* is smaller than the configured size, try to recover from it automatically. */
if (server.aof_load_broken) {
if (valid_up_to == -1) {
serverLog(LL_WARNING,"Last valid command offset is invalid");
} else if ((size_t)(sb.st_size - valid_up_to) < (size_t)server.aof_load_broken_max_size) {
if (truncate(aof_filepath,valid_up_to) == -1) {
serverLog(LL_WARNING,"Error truncating the AOF file: %s",
strerror(errno));
} else {
/* Make sure the AOF file descriptor points to the end of the
* file after the truncate call. */
if (server.aof_fd != -1 && lseek(server.aof_fd,0,SEEK_END) == -1) {
serverLog(LL_WARNING,"Can't seek the end of the AOF file: %s",
strerror(errno));
} else {
serverLog(LL_WARNING,
"AOF loaded anyway because aof-load-broken is enabled and "
"broken size '%lld' is less than aof-load-broken-max-size '%lld'",
(long long)(sb.st_size - valid_up_to), (long long)(server.aof_load_broken_max_size));
ret = AOF_BROKEN_RECOVERED;
goto loaded_ok;
}
}
} else { /* The size of the corrupted portion exceeds the configured limit. */
serverLog(LL_WARNING,
"AOF was not loaded because the size of the corrupted portion "
"exceeds the configured limit. aof-load-broken is enabled and broken size '%lld' "
"is bigger than aof-load-broken-max-size '%lld'",
(long long)(sb.st_size - valid_up_to), (long long)(server.aof_load_broken_max_size));
}
} else {
serverLog(LL_WARNING, "Bad file format reading the append only file %s: "
"make a backup of your AOF file, then use ./redis-check-aof --fix <filename.manifest>", filename);
}
ret = AOF_FAILED;
/* fall through to cleanup. */

Expand Down Expand Up @@ -1794,18 +1827,18 @@ int loadAppendOnlyFiles(aofManifest *am) {
last_file = ++aof_num == total_num;
start = ustime();
ret = loadSingleAppendOnlyFile(aof_name);
if (ret == AOF_OK || (ret == AOF_TRUNCATED && last_file)) {
if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
serverLog(LL_NOTICE, "DB loaded from base file %s: %.3f seconds",
aof_name, (float)(ustime()-start)/1000000);
}

/* If the truncated file is not the last file, we consider this to be a fatal error. */
if (ret == AOF_TRUNCATED && !last_file) {
if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
ret = AOF_FAILED;
serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
}

if (ret == AOF_OPEN_ERR || ret == AOF_FAILED) {
if (ret == AOF_OPEN_ERR || ret == AOF_FAILED || ret == AOF_BROKEN_RECOVERED) {
goto cleanup;
Comment on lines +1830 to 1842

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check AOF_BROKEN_RECOVERED usage and related functions
echo "=== AOF_BROKEN_RECOVERED occurrences ==="
rg -n "AOF_BROKEN_RECOVERED" src

echo ""
echo "=== loadAppendOnlyFiles callers ==="
rg -n "loadAppendOnlyFiles\(" src

echo ""
echo "=== stopLoading usage ==="
rg -n "stopLoading\(" src

echo ""
echo "=== Check AOF return value enum ==="
rg -n "AOF_OK|AOF_TRUNCATED|AOF_FAILED|AOF_OPEN_ERR" src/aof.h

Repository: qodo-benchmark/redis-combined-coderabbit

Length of output: 1909


🏁 Script executed:

#!/bin/bash
# Check context around stopLoading at line 1894
echo "=== Context around stopLoading at aof.c:1894 ==="
sed -n '1880,1900p' src/aof.c

echo ""
echo "=== Check loadAppendOnlyFiles caller in server.c:7036 ==="
sed -n '7030,7050p' src/server.c

echo ""
echo "=== Check if AOF_BROKEN_RECOVERED should be treated as success ==="
sed -n '1740,1750p' src/aof.c

Repository: qodo-benchmark/redis-combined-coderabbit

Length of output: 2857


🏁 Script executed:

#!/bin/bash
# Check stopLoading implementation
echo "=== stopLoading function ==="
sed -n '3228,3260p' src/rdb.c

echo ""
echo "=== Look for server.loading usage that might be affected ==="
rg -n "server\.loading" src/server.c | head -20

Repository: qodo-benchmark/redis-combined-coderabbit

Length of output: 2937


Include AOF_BROKEN_RECOVERED in the stopLoading success condition

At line 1894, stopLoading() is called with ret == AOF_OK || ret == AOF_TRUNCATED, but this omits AOF_BROKEN_RECOVERED. The caller in server.c:7036 treats AOF_BROKEN_RECOVERED as a successful recovery (doesn't exit, logs success), yet stopLoading(0) fires REDISMODULE_SUBEVENT_LOADING_FAILED to modules. This inconsistency can cause modules to react incorrectly to what is actually a successful recovery.

Fix
-    stopLoading(ret == AOF_OK || ret == AOF_TRUNCATED);
+    stopLoading(ret == AOF_OK || ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED);

This also applies to the corresponding code at lines 1860-1873.

🤖 Prompt for AI Agents
In `@src/aof.c` around lines 1830 - 1842, The stopLoading call currently treats
only AOF_OK and AOF_TRUNCATED as success but omits AOF_BROKEN_RECOVERED, causing
stopLoading(0) to emit REDISMODULE_SUBEVENT_LOADING_FAILED for a recovery that
should be considered successful; update the condition(s) where stopLoading is
invoked (look for calls using the local ret variable and checks against AOF_OK
|| AOF_TRUNCATED) to include AOF_BROKEN_RECOVERED (i.e., treat ret == AOF_OK ||
ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED as success), and make this
change for both corresponding code paths in aof.c so modules receive the correct
loading event.

}
}
Expand All @@ -1824,7 +1857,7 @@ int loadAppendOnlyFiles(aofManifest *am) {
last_file = ++aof_num == total_num;
start = ustime();
ret = loadSingleAppendOnlyFile(aof_name);
if (ret == AOF_OK || (ret == AOF_TRUNCATED && last_file)) {
if (ret == AOF_OK || ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && last_file)) {
serverLog(LL_NOTICE, "DB loaded from incr file %s: %.3f seconds",
aof_name, (float)(ustime()-start)/1000000);
}
Expand All @@ -1834,7 +1867,7 @@ int loadAppendOnlyFiles(aofManifest *am) {
if (ret == AOF_EMPTY) ret = AOF_OK;

/* If the truncated file is not the last file, we consider this to be a fatal error. */
if (ret == AOF_TRUNCATED && !last_file) {
if ((ret == AOF_TRUNCATED || ret == AOF_BROKEN_RECOVERED) && !last_file) {
ret = AOF_FAILED;
serverLog(LL_WARNING, "Fatal error: the truncated file is not the last file");
}
Expand Down
2 changes: 2 additions & 0 deletions src/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -3090,6 +3090,7 @@ standardConfig static_configs[] = {
createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL),
createBoolConfig("rdb-save-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.rdb_save_incremental_fsync, 1, NULL, NULL),
createBoolConfig("aof-load-truncated", NULL, MODIFIABLE_CONFIG, server.aof_load_truncated, 1, NULL, NULL),
createBoolConfig("aof-load-broken", NULL, MODIFIABLE_CONFIG, server.aof_load_broken, 0, NULL, NULL),
createBoolConfig("aof-use-rdb-preamble", NULL, MODIFIABLE_CONFIG, server.aof_use_rdb_preamble, 1, NULL, NULL),
createBoolConfig("aof-timestamp-enabled", NULL, MODIFIABLE_CONFIG, server.aof_timestamp_enabled, 0, NULL, NULL),
createBoolConfig("cluster-replica-no-failover", "cluster-slave-no-failover", MODIFIABLE_CONFIG, server.cluster_slave_no_failover, 0, NULL, updateClusterFlags), /* Failover by default. */
Expand Down Expand Up @@ -3254,6 +3255,7 @@ standardConfig static_configs[] = {
createTimeTConfig("repl-backlog-ttl", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.repl_backlog_time_limit, 60*60, INTEGER_CONFIG, NULL, NULL), /* Default: 1 hour */
createOffTConfig("auto-aof-rewrite-min-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_rewrite_min_size, 64*1024*1024, MEMORY_CONFIG, NULL, NULL),
createOffTConfig("loading-process-events-interval-bytes", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1024, INT_MAX, server.loading_process_events_interval_bytes, 1024*512, INTEGER_CONFIG, NULL, NULL),
createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL),

Comment on lines 3256 to 3259

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Use a wider upper bound and allow size units.
Line 3258 uses LONG_MAX (can be 32‑bit) and INTEGER_CONFIG, which may reject large values and common unit suffixes for a byte-size setting. Prefer LLONG_MAX and MEMORY_CONFIG to align with other size configs and avoid unintended caps.

🛠️ Proposed fix
-    createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL),
+    createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_load_broken_max_size, 4*1024, MEMORY_CONFIG, NULL, NULL),
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
createOffTConfig("auto-aof-rewrite-min-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_rewrite_min_size, 64*1024*1024, MEMORY_CONFIG, NULL, NULL),
createOffTConfig("loading-process-events-interval-bytes", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1024, INT_MAX, server.loading_process_events_interval_bytes, 1024*512, INTEGER_CONFIG, NULL, NULL),
createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL),
createOffTConfig("auto-aof-rewrite-min-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_rewrite_min_size, 64*1024*1024, MEMORY_CONFIG, NULL, NULL),
createOffTConfig("loading-process-events-interval-bytes", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1024, INT_MAX, server.loading_process_events_interval_bytes, 1024*512, INTEGER_CONFIG, NULL, NULL),
createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_load_broken_max_size, 4*1024, MEMORY_CONFIG, NULL, NULL),
🤖 Prompt for AI Agents
In `@src/config.c` around lines 3256 - 3259, The "aof-load-broken-max-size" config
created by createOffTConfig uses LONG_MAX and INTEGER_CONFIG which can cap
values and reject unit suffixes; update the createOffTConfig call for
"aof-load-broken-max-size" to use LLONG_MAX as the upper bound and MEMORY_CONFIG
as the type (matching server.aof_load_broken_max_size semantics) so large values
and byte-unit suffixes are accepted.

createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.tls_port, 0, INTEGER_CONFIG, NULL, applyTLSPort), /* TCP port. */
createIntConfig("tls-session-cache-size", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tls_ctx_config.session_cache_size, 20*1024, INTEGER_CONFIG, NULL, applyTlsCfg),
Expand Down
7 changes: 5 additions & 2 deletions src/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
* Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
*/

#ifndef __REDIS_H
#define __REDIS_H
#ifndef _REDIS_H
#define _REDIS_H

#include "fmacros.h"
#include "config.h"
Expand Down Expand Up @@ -345,6 +345,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
#define AOF_OPEN_ERR 3
#define AOF_FAILED 4
#define AOF_TRUNCATED 5
#define AOF_BROKEN_RECOVERED 6

/* RDB return values for rdbLoad. */
#define RDB_OK 0
Expand Down Expand Up @@ -2006,6 +2007,8 @@ struct redisServer {
int aof_last_write_status; /* C_OK or C_ERR */
int aof_last_write_errno; /* Valid if aof write/fsync status is ERR */
int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */
int aof_load_broken; /* Don't stop on bad fmt. */
off_t aof_load_broken_max_size; /* The max size of broken AOF tail that can be ignored. */
int aof_use_rdb_preamble; /* Specify base AOF to use RDB encoding on AOF rewrites. */
redisAtomic int aof_bio_fsync_status; /* Status of AOF fsync in bio job. */
redisAtomic int aof_bio_fsync_errno; /* Errno of AOF fsync in bio job. */
Expand Down
136 changes: 136 additions & 0 deletions tests/integration/aof.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -701,4 +701,140 @@ tags {"aof external:skip"} {
assert_equal {1} [r get t]
}
}

# Check AOF load broken behavior
# Corrupted base AOF, existing AOF files
create_aof $aof_dirpath $aof_base_file {
append_to_aof [formatCommand set param ok]
append_to_aof "corruption"
}
create_aof $aof_dirpath $aof_file {
append_to_aof [formatCommand set foo hello]
}
start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] {
test "Log should mention truncated file is not last" {
wait_for_log_messages 0 {
{*AOF loaded anyway because aof-load-broken is enabled*}
{*Fatal error: the truncated file is not the last file*}
} 0 10 1000
}
}

# Remove all incr AOF files so that the base file becomes the last file
exec rm -f $aof_dirpath/appendonly.aof.*
start_server_aof [list dir $server_path aof-load-broken yes] {
test "Corrupted base AOF (last file): should recover" {
assert_equal 1 [is_alive [srv pid]]
}

test "param should be 'ok'" {
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get param] eq "ok"}
}
}
Comment on lines +723 to +735

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Avoid deleting the base/manifest when making the base the last file

rm -f $aof_dirpath/appendonly.aof.* also matches the base and manifest, which would leave no recoverable AOF for the next assertions. Prefer deleting only INCR files and rewrite the manifest to a base-only entry.

🧩 Suggested fix
-    exec rm -f $aof_dirpath/appendonly.aof.*
+    # Remove only INCR files and reset the manifest so the base is last.
+    exec rm -f $aof_dirpath/*$::incr_aof_sufix$::aof_format_suffix
+    create_aof_manifest $aof_dirpath $aof_manifest_file {
+        append_to_manifest "file ${aof_basename}.1$::base_aof_sufix$::aof_format_suffix seq 1 type b\n"
+    }
🤖 Prompt for AI Agents
In `@tests/integration/aof.tcl` around lines 723 - 735, The current rm -f
$aof_dirpath/appendonly.aof.* will remove the base and manifest too; change the
cleanup to only remove INCR files (e.g. target filenames matching the INCR
suffix pattern while excluding appendonly.aof and the manifest) and ensure the
AOF manifest is rewritten so it points only to the base file (making the base
the last file) before calling start_server_aof; update the shell command that
removes files and add a small step to recreate/update the manifest (referencing
aof_dirpath, the appendonly.aof naming, and the
start_server_aof/wait_done_loading test block) so the tests assert against a
recoverable base-only AOF.


# Should also start with broken incr AOF.
create_aof $aof_dirpath $aof_file {
append_to_aof [formatCommand set foo 1]
append_to_aof [formatCommand incr foo]
append_to_aof [formatCommand incr foo]
append_to_aof [formatCommand incr foo]
append_to_aof [formatCommand incr foo]
append_to_aof "corruption"
}

start_server_aof [list dir $server_path aof-load-broken yes] {
test "Short read: Server should start if aof-load-broken is yes" {
assert_equal 1 [is_alive [srv pid]]
}

# The load is expected to succeed because the corrupted tail is smaller than the default
# aof-load-broken-max-size of 4096 bytes, so the AOF is loaded without the corruption
test "Broken AOF loaded: we expect foo to be equal to 5" {
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get foo] eq "5"}
}

test "Append a new command after loading an incomplete AOF" {
$client incr foo
}
}

start_server_aof [list dir $server_path aof-load-broken yes] {
test "Short read + command: Server should start" {
assert_equal 1 [is_alive [srv pid]]
}

test "Broken AOF loaded: we expect foo to be equal to 6 now" {
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get foo] eq "6"}
}
}

# Test that the server exits when the AOF contains a format error
create_aof $aof_dirpath $aof_file {
append_to_aof [formatCommand set foo hello]
append_to_aof [string range [formatCommand incr foo] 0 end-3]
append_to_aof "corruption"
}

# We set the maximum allowed corrupted size to 2 bytes, but the actual corrupted portion is larger,
# so the AOF file will not be reloaded.
start_server_aof_ex [list dir $server_path aof-load-broken yes aof-load-broken-max-size 2] [list wait_ready false] {
test "Bad format: Server should have logged an error" {
wait_for_log_messages 0 {"*AOF was not loaded because the size*"} 0 10 1000
}
}

create_aof_manifest $aof_dirpath $aof_manifest_file {
append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n"
append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n"
append_to_manifest "file appendonly.aof.2.incr.aof seq 2 type i\n"
}
# Create base AOF file
set base_aof_file "$aof_dirpath/appendonly.aof.1.base.aof"
create_aof $aof_dirpath $base_aof_file {
append_to_aof [formatCommand set fo base]
}

# Create middle incr AOF file with corruption
set mid_aof_file "$aof_dirpath/appendonly.aof.1.incr.aof"
create_aof $aof_dirpath $mid_aof_file {
append_to_aof [formatCommand set fo mid]
append_to_aof "CORRUPTION"
}

# Create last incr AOF file (valid)
set last_aof_file "$aof_dirpath/appendonly.aof.2.incr.aof"
create_aof $aof_dirpath $last_aof_file {
append_to_aof [formatCommand set fo last]
}

# Check that Redis fails to load because corruption is in the middle file
start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] {
test "Intermediate AOF is broken: should log fatal and not start" {
wait_for_log_messages 0 {
{*Fatal error: the truncated file is not the last file*}
} 0 10 1000
}
}

# Swap mid and last files
set tmp_file "$aof_dirpath/temp.aof"
file rename -force $mid_aof_file $tmp_file
file rename -force $last_aof_file $mid_aof_file
file rename -force $tmp_file $last_aof_file

# Should now start successfully since corruption is in last AOF file
start_server_aof [list dir $server_path aof-load-broken yes] {
test "Corrupted last AOF file: Server should still start and recover" {
assert_equal 1 [is_alive [srv pid]]
set client [redis [srv host] [srv port] 0 $::tls]
wait_done_loading $client
assert {[$client get fo] eq "mid"}
}
}
}
Loading