From e2c63a7062e3acbe44b8f7122c347183b87cbd89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 20 Jan 2026 12:41:27 +0200 Subject: [PATCH] MDEV-38595: Simplify InnoDB doublewrite buffer creation buf_dblwr_t::create(): Create the doublewrite buffer in a single atomic mini-transaction. Do not write any log records for initializing any doublewrite buffer pages, in order to avoid recovery failure with innodb_log_archive=ON starting from the very beginning. The mtr.commit() in buf_dblwr_t::create() was observed to comprise 295 mtr_t::m_memo entries: 1 entry for the fil_system.sys_space and the rest split between page 5 (TRX_SYS) and page 0 (allocation metadata). We are nowhere near the sux_lock::RECURSIVE_MAX limit of 65535 per page descriptor. Reviewed by: Thirunarayanan Balathandayuthapani Tested by: Saahil Alam --- storage/innobase/buf/buf0dblwr.cc | 98 ++++++++++-------------------- storage/innobase/include/mtr0mtr.h | 3 + 2 files changed, 35 insertions(+), 66 deletions(-) diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 3f26fe23ceecf..e32b68ed52d7a 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -93,15 +93,17 @@ bool buf_dblwr_t::create() noexcept mtr.start(); dberr_t err; - buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + buf_block_t *const trx_sys_block= buf_dblwr_trx_sys_get(&mtr); if (!trx_sys_block) { mtr.commit(); return false; } - if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + - trx_sys_block->page.frame) == + byte *const fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->page.frame; + + if (mach_read_from_4(fseg_header + FSEG_HEADER_SIZE) == TRX_SYS_DOUBLEWRITE_MAGIC_N) { /* The doublewrite buffer has already been created: just read in @@ -135,21 +137,21 @@ bool buf_dblwr_t::create() noexcept sql_print_information("InnoDB: Doublewrite buffer not found:" " creating new"); - /* FIXME: After this point, the doublewrite buffer creation - is not atomic. The doublewrite buffer should not exist in + /* FIXME: The doublewrite buffer should not exist in the InnoDB system tablespace file in the first place. It could be located in separate optional file(s) in a user-specified location. */ } - byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + - trx_sys_block->page.frame; + mtr_t init_mtr; + init_mtr.start(); + for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE; i < 2 * size + extent_size / 2; i++) { buf_block_t *new_block= fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP, - false, &mtr, &mtr, &err); + false, &mtr, &init_mtr, &err); if (!new_block) { sql_print_error("InnoDB: Cannot create doublewrite buffer: " @@ -164,75 +166,39 @@ bool buf_dblwr_t::create() noexcept return false; } - /* We read the allocated pages to the buffer pool; when they are - written to disk in a flush, the space id and page number fields - are also written to the pages. When we at database startup read - pages from the doublewrite buffer, we know that if the space id - and page number in them are the same as the page position in the - tablespace, then the page has not been written to in - doublewrite. */ - - ut_ad(new_block->page.lock.not_recursive()); const page_id_t id= new_block->page.id(); - /* We only do this in the debug build, to ensure that the check in - buf_flush_init_for_writing() will see a valid page type. The - flushes of new_block are actually unnecessary here. */ - ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame, - FIL_PAGE_TYPE_SYS)); + /* Normally, allocated pages will be modified further. However, + the pages of the doublewrite buffer are just dummy storage, not + covered by the write-ahead log. */ + ut_ad(init_mtr.get_savepoint() == 1); + ut_ad(init_mtr.m_memo[0].object == new_block); + ut_ad(init_mtr.m_memo[0].type == MTR_MEMO_PAGE_X_MODIFY); + init_mtr.m_memo[0].type= MTR_MEMO_PAGE_X_FIX; + init_mtr.rollback_to_savepoint(0, 1); + init_mtr.m_log.erase(); if (i == size / 2) - { ut_a(id.page_no() == size); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 + - trx_sys_block->page.frame, id.page_no()); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + - TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame, - id.page_no()); - } else if (i == size / 2 + size) - { ut_a(id.page_no() == 2 * size); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 + - trx_sys_block->page.frame, id.page_no()); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + - TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame, - id.page_no()); - } else if (i > size / 2) ut_a(id.page_no() == prev_page_no + 1); - - if (((i + 1) & 15) == 0) { - /* rw_locks can only be recursively x-locked 2048 times. (on 32 - bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a - negative number, and thus lock_word becomes like a shared lock). - For 4k page size this loop will lock the fseg header too many - times. Since this code is not done while any other threads are - active, restart the MTR occasionally. */ - mtr.commit(); - mtr.start(); - trx_sys_block= buf_dblwr_trx_sys_get(&mtr); - fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + - trx_sys_block->page.frame; - } - prev_page_no= id.page_no(); } - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + - trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + - TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame, - TRX_SYS_DOUBLEWRITE_MAGIC_N); - - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + - trx_sys_block->page.frame, + ut_ad(init_mtr.is_empty()); + byte *const doublewrite= fseg_header + + (TRX_SYS_DOUBLEWRITE_MAGIC - TRX_SYS_DOUBLEWRITE_FSEG); + mtr.write<4>(*trx_sys_block, doublewrite, TRX_SYS_DOUBLEWRITE_MAGIC_N); + static_assert(TRX_SYS_DOUBLEWRITE_BLOCK1==TRX_SYS_DOUBLEWRITE_MAGIC + 4, ""); + mtr.write<4>(*trx_sys_block, doublewrite + 4, size); + static_assert(TRX_SYS_DOUBLEWRITE_BLOCK2==TRX_SYS_DOUBLEWRITE_MAGIC + 8, ""); + mtr.write<4>(*trx_sys_block, doublewrite + 8, size * 2); + static_assert(TRX_SYS_DOUBLEWRITE_REPEAT == 12, ""); + mtr.memcpy(*trx_sys_block, doublewrite + 12, doublewrite, 12); + static_assert(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED == + 24 + TRX_SYS_DOUBLEWRITE_MAGIC, ""); + mtr.write<4>(*trx_sys_block, doublewrite + 24, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N); mtr.commit(); diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 3828a31cdb518..f0395748c8b6d 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -63,10 +63,13 @@ struct mtr_memo_slot_t void release() const; }; +class buf_dblwr_t; + /** Mini-transaction handle and buffer */ struct mtr_t { mtr_t(); ~mtr_t(); + friend buf_dblwr_t; /** Start a mini-transaction. */ void start();