-
Notifications
You must be signed in to change notification settings - Fork 0
Add auto-repair options for broken AOF tail on startup #20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: coderabbit_combined_20260121_augment_sentry_coderabbit_1_base_add_auto-repair_options_for_broken_aof_tail_on_startup_pr102
Are you sure you want to change the base?
Changes from all commits
e806a17
6d1baaa
acde9f1
6d9e785
c4a03b0
a8ba2bd
0cb9ced
b1fd5c2
56b5707
6c1bb33
63f818f
97c4edc
187f338
2290ae6
35a7595
6654b8d
fce074d
b2555bb
5bc675e
e73a7dc
a3698e2
a9b1640
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3090,6 +3090,7 @@ standardConfig static_configs[] = { | |||||||||||||
| createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL), | ||||||||||||||
| createBoolConfig("rdb-save-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.rdb_save_incremental_fsync, 1, NULL, NULL), | ||||||||||||||
| createBoolConfig("aof-load-truncated", NULL, MODIFIABLE_CONFIG, server.aof_load_truncated, 1, NULL, NULL), | ||||||||||||||
| createBoolConfig("aof-load-broken", NULL, MODIFIABLE_CONFIG, server.aof_load_broken, 0, NULL, NULL), | ||||||||||||||
| createBoolConfig("aof-use-rdb-preamble", NULL, MODIFIABLE_CONFIG, server.aof_use_rdb_preamble, 1, NULL, NULL), | ||||||||||||||
| createBoolConfig("aof-timestamp-enabled", NULL, MODIFIABLE_CONFIG, server.aof_timestamp_enabled, 0, NULL, NULL), | ||||||||||||||
| createBoolConfig("cluster-replica-no-failover", "cluster-slave-no-failover", MODIFIABLE_CONFIG, server.cluster_slave_no_failover, 0, NULL, updateClusterFlags), /* Failover by default. */ | ||||||||||||||
|
|
@@ -3254,6 +3255,7 @@ standardConfig static_configs[] = { | |||||||||||||
| createTimeTConfig("repl-backlog-ttl", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.repl_backlog_time_limit, 60*60, INTEGER_CONFIG, NULL, NULL), /* Default: 1 hour */ | ||||||||||||||
| createOffTConfig("auto-aof-rewrite-min-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_rewrite_min_size, 64*1024*1024, MEMORY_CONFIG, NULL, NULL), | ||||||||||||||
| createOffTConfig("loading-process-events-interval-bytes", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 1024, INT_MAX, server.loading_process_events_interval_bytes, 1024*512, INTEGER_CONFIG, NULL, NULL), | ||||||||||||||
| createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL), | ||||||||||||||
|
|
||||||||||||||
|
Comment on lines
3256
to
3259
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use a wider upper bound and allow size units. 🛠️ Proposed fix- createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.aof_load_broken_max_size, 4*1024, INTEGER_CONFIG, NULL, NULL),
+ createOffTConfig("aof-load-broken-max-size", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, server.aof_load_broken_max_size, 4*1024, MEMORY_CONFIG, NULL, NULL),📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||
| createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.tls_port, 0, INTEGER_CONFIG, NULL, applyTLSPort), /* TCP port. */ | ||||||||||||||
| createIntConfig("tls-session-cache-size", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tls_ctx_config.session_cache_size, 20*1024, INTEGER_CONFIG, NULL, applyTlsCfg), | ||||||||||||||
|
|
||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -701,4 +701,140 @@ tags {"aof external:skip"} { | |
| assert_equal {1} [r get t] | ||
| } | ||
| } | ||
|
|
||
| # Check AOF load broken behavior | ||
| # Corrupted base AOF, existing AOF files | ||
| create_aof $aof_dirpath $aof_base_file { | ||
| append_to_aof [formatCommand set param ok] | ||
| append_to_aof "corruption" | ||
| } | ||
| create_aof $aof_dirpath $aof_file { | ||
| append_to_aof [formatCommand set foo hello] | ||
| } | ||
| start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] { | ||
| test "Log should mention truncated file is not last" { | ||
| wait_for_log_messages 0 { | ||
| {*AOF loaded anyway because aof-load-broken is enabled*} | ||
| {*Fatal error: the truncated file is not the last file*} | ||
| } 0 10 1000 | ||
| } | ||
| } | ||
|
|
||
| # Remove all incr AOF files to make the base file being the last file | ||
| exec rm -f $aof_dirpath/appendonly.aof.* | ||
| start_server_aof [list dir $server_path aof-load-broken yes] { | ||
| test "Corrupted base AOF (last file): should recover" { | ||
| assert_equal 1 [is_alive [srv pid]] | ||
| } | ||
|
|
||
| test "param should be 'ok'" { | ||
| set client [redis [srv host] [srv port] 0 $::tls] | ||
| wait_done_loading $client | ||
| assert {[$client get param] eq "ok"} | ||
| } | ||
| } | ||
|
Comment on lines
+723
to
+735
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid deleting the base/manifest when making the base the last file
🧩 Suggested fix- exec rm -f $aof_dirpath/appendonly.aof.*
+ # Remove only INCR files and reset the manifest so the base is last.
+ exec rm -f $aof_dirpath/*$::incr_aof_sufix$::aof_format_suffix
+ create_aof_manifest $aof_dirpath $aof_manifest_file {
+ append_to_manifest "file ${aof_basename}.1$::base_aof_sufix$::aof_format_suffix seq 1 type b\n"
+ }🤖 Prompt for AI Agents |
||
|
|
||
| # Should also start with broken incr AOF. | ||
| create_aof $aof_dirpath $aof_file { | ||
| append_to_aof [formatCommand set foo 1] | ||
| append_to_aof [formatCommand incr foo] | ||
| append_to_aof [formatCommand incr foo] | ||
| append_to_aof [formatCommand incr foo] | ||
| append_to_aof [formatCommand incr foo] | ||
| append_to_aof "corruption" | ||
| } | ||
|
|
||
| start_server_aof [list dir $server_path aof-load-broken yes] { | ||
| test "Short read: Server should start if aof-load-broken is yes" { | ||
| assert_equal 1 [is_alive [srv pid]] | ||
| } | ||
|
|
||
| # The AOF file is expected to be correct because default value for aof-load-broken-max-size is 4096, | ||
| # so the AOF will reload without the corruption | ||
| test "Broken AOF loaded: we expect foo to be equal to 5" { | ||
| set client [redis [srv host] [srv port] 0 $::tls] | ||
| wait_done_loading $client | ||
| assert {[$client get foo] eq "5"} | ||
| } | ||
|
|
||
| test "Append a new command after loading an incomplete AOF" { | ||
| $client incr foo | ||
| } | ||
| } | ||
|
|
||
| start_server_aof [list dir $server_path aof-load-broken yes] { | ||
| test "Short read + command: Server should start" { | ||
| assert_equal 1 [is_alive [srv pid]] | ||
| } | ||
|
|
||
| test "Broken AOF loaded: we expect foo to be equal to 6 now" { | ||
| set client [redis [srv host] [srv port] 0 $::tls] | ||
| wait_done_loading $client | ||
| assert {[$client get foo] eq "6"} | ||
| } | ||
| } | ||
|
|
||
| # Test that the server exits when the AOF contains a format error | ||
| create_aof $aof_dirpath $aof_file { | ||
| append_to_aof [formatCommand set foo hello] | ||
| append_to_aof [string range [formatCommand incr foo] 0 end-3] | ||
| append_to_aof "corruption" | ||
| } | ||
|
|
||
| # We set the maximum allowed corrupted size to 2 bytes, but the actual corrupted portion is larger, | ||
| # so the AOF file will not be reloaded. | ||
| start_server_aof_ex [list dir $server_path aof-load-broken yes aof-load-broken-max-size 2] [list wait_ready false] { | ||
| test "Bad format: Server should have logged an error" { | ||
| wait_for_log_messages 0 {"*AOF was not loaded because the size*"} 0 10 1000 | ||
| } | ||
| } | ||
|
|
||
| create_aof_manifest $aof_dirpath $aof_manifest_file { | ||
| append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n" | ||
| append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n" | ||
| append_to_manifest "file appendonly.aof.2.incr.aof seq 2 type i\n" | ||
| } | ||
| # Create base AOF file | ||
| set base_aof_file "$aof_dirpath/appendonly.aof.1.base.aof" | ||
| create_aof $aof_dirpath $base_aof_file { | ||
| append_to_aof [formatCommand set fo base] | ||
| } | ||
|
|
||
| # Create middle incr AOF file with corruption | ||
| set mid_aof_file "$aof_dirpath/appendonly.aof.1.incr.aof" | ||
| create_aof $aof_dirpath $mid_aof_file { | ||
| append_to_aof [formatCommand set fo mid] | ||
| append_to_aof "CORRUPTION" | ||
| } | ||
|
|
||
| # Create last incr AOF file (valid) | ||
| set last_aof_file "$aof_dirpath/appendonly.aof.2.incr.aof" | ||
| create_aof $aof_dirpath $last_aof_file { | ||
| append_to_aof [formatCommand set fo last] | ||
| } | ||
|
|
||
| # Check that Redis fails to load because corruption is in the middle file | ||
| start_server_aof_ex [list dir $server_path aof-load-broken yes] [list wait_ready false] { | ||
| test "Intermediate AOF is broken: should log fatal and not start" { | ||
| wait_for_log_messages 0 { | ||
| {*Fatal error: the truncated file is not the last file*} | ||
| } 0 10 1000 | ||
| } | ||
| } | ||
|
|
||
| # Swap mid and last files | ||
| set tmp_file "$aof_dirpath/temp.aof" | ||
| file rename -force $mid_aof_file $tmp_file | ||
| file rename -force $last_aof_file $mid_aof_file | ||
| file rename -force $tmp_file $last_aof_file | ||
|
|
||
| # Should now start successfully since corruption is in last AOF file | ||
| start_server_aof [list dir $server_path aof-load-broken yes] { | ||
| test "Corrupted last AOF file: Server should still start and recover" { | ||
| assert_equal 1 [is_alive [srv pid]] | ||
| set client [redis [srv host] [srv port] 0 $::tls] | ||
| wait_done_loading $client | ||
| assert {[$client get fo] eq "mid"} | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: qodo-benchmark/redis-combined-coderabbit
Length of output: 1909
🏁 Script executed:
Repository: qodo-benchmark/redis-combined-coderabbit
Length of output: 2857
🏁 Script executed:
Repository: qodo-benchmark/redis-combined-coderabbit
Length of output: 2937
Include
AOF_BROKEN_RECOVEREDin the stopLoading success conditionAt line 1894,
stopLoading()is called withret == AOF_OK || ret == AOF_TRUNCATED, but this omitsAOF_BROKEN_RECOVERED. The caller inserver.c:7036treatsAOF_BROKEN_RECOVEREDas a successful recovery (doesn't exit, logs success), yetstopLoading(0)firesREDISMODULE_SUBEVENT_LOADING_FAILEDto modules. This inconsistency can cause modules to react incorrectly to what is actually a successful recovery.Fix
This also applies to the corresponding code at lines 1860-1873.
🤖 Prompt for AI Agents